Unverified Commit 1b792e71 authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

[ci] [python-package] enable ruff-format on tests and examples (#6317)

parent b60068c8
...@@ -19,8 +19,9 @@ from .utils import dummy_obj, load_breast_cancer, mse_obj, np_assert_array_equal ...@@ -19,8 +19,9 @@ from .utils import dummy_obj, load_breast_cancer, mse_obj, np_assert_array_equal
def test_basic(tmp_path): def test_basic(tmp_path):
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), X_train, X_test, y_train, y_test = train_test_split(
test_size=0.1, random_state=2) *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
)
feature_names = [f"Column_{i}" for i in range(X_train.shape[1])] feature_names = [f"Column_{i}" for i in range(X_train.shape[1])]
feature_names[1] = "a" * 1000 # set one name to a value longer than default buffer size feature_names[1] = "a" * 1000 # set one name to a value longer than default buffer size
train_data = lgb.Dataset(X_train, label=y_train, feature_name=feature_names) train_data = lgb.Dataset(X_train, label=y_train, feature_name=feature_names)
...@@ -34,7 +35,7 @@ def test_basic(tmp_path): ...@@ -34,7 +35,7 @@ def test_basic(tmp_path):
"verbose": -1, "verbose": -1,
"num_threads": 1, "num_threads": 1,
"max_bin": 255, "max_bin": 255,
"gpu_use_dp": True "gpu_use_dp": True,
} }
bst = lgb.Booster(params, train_data) bst = lgb.Booster(params, train_data)
bst.add_valid(valid_data, "valid_1") bst.add_valid(valid_data, "valid_1")
...@@ -49,7 +50,7 @@ def test_basic(tmp_path): ...@@ -49,7 +50,7 @@ def test_basic(tmp_path):
assert bst.current_iteration() == 20 assert bst.current_iteration() == 20
assert bst.num_trees() == 20 assert bst.num_trees() == 20
assert bst.num_model_per_iteration() == 1 assert bst.num_model_per_iteration() == 1
if getenv('TASK', '') != 'cuda': if getenv("TASK", "") != "cuda":
assert bst.lower_bound() == pytest.approx(-2.9040190126976606) assert bst.lower_bound() == pytest.approx(-2.9040190126976606)
assert bst.upper_bound() == pytest.approx(3.3182142872462883) assert bst.upper_bound() == pytest.approx(3.3182142872462883)
...@@ -79,20 +80,19 @@ def test_basic(tmp_path): ...@@ -79,20 +80,19 @@ def test_basic(tmp_path):
# test that shape is checked during prediction # test that shape is checked during prediction
bad_X_test = X_test[:, 1:] bad_X_test = X_test[:, 1:]
bad_shape_error_msg = "The number of features in data*" bad_shape_error_msg = "The number of features in data*"
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, bad_X_test)
bst.predict, bad_X_test) np.testing.assert_raises_regex(
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, sparse.csr_matrix(bad_X_test)
bst.predict, sparse.csr_matrix(bad_X_test)) )
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, np.testing.assert_raises_regex(
bst.predict, sparse.csc_matrix(bad_X_test)) lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, sparse.csc_matrix(bad_X_test)
)
with open(tname, "w+b") as f: with open(tname, "w+b") as f:
dump_svmlight_file(bad_X_test, y_test, f) dump_svmlight_file(bad_X_test, y_test, f)
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, tname)
bst.predict, tname)
with open(tname, "w+b") as f: with open(tname, "w+b") as f:
dump_svmlight_file(X_test, y_test, f, zero_based=False) dump_svmlight_file(X_test, y_test, f, zero_based=False)
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, tname)
bst.predict, tname)
class NumpySequence(lgb.Sequence): class NumpySequence(lgb.Sequence):
...@@ -108,7 +108,7 @@ class NumpySequence(lgb.Sequence): ...@@ -108,7 +108,7 @@ class NumpySequence(lgb.Sequence):
elif isinstance(idx, slice): elif isinstance(idx, slice):
if not (idx.step is None or idx.step == 1): if not (idx.step is None or idx.step == 1):
raise NotImplementedError("No need to implement, caller will not set step by now") raise NotImplementedError("No need to implement, caller will not set step by now")
return self.ndarray[idx.start:idx.stop] return self.ndarray[idx.start : idx.stop]
elif isinstance(idx, list): elif isinstance(idx, list):
return self.ndarray[idx] return self.ndarray[idx]
else: else:
...@@ -132,12 +132,12 @@ def _create_sequence_from_ndarray(data, num_seq, batch_size): ...@@ -132,12 +132,12 @@ def _create_sequence_from_ndarray(data, num_seq, batch_size):
return seqs return seqs
@pytest.mark.parametrize('sample_count', [11, 100, None]) @pytest.mark.parametrize("sample_count", [11, 100, None])
@pytest.mark.parametrize('batch_size', [3, None]) @pytest.mark.parametrize("batch_size", [3, None])
@pytest.mark.parametrize('include_0_and_nan', [False, True]) @pytest.mark.parametrize("include_0_and_nan", [False, True])
@pytest.mark.parametrize('num_seq', [1, 3]) @pytest.mark.parametrize("num_seq", [1, 3])
def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq): def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
params = {'bin_construct_sample_cnt': sample_count} params = {"bin_construct_sample_cnt": sample_count}
nrow = 50 nrow = 50
half_nrow = nrow // 2 half_nrow = nrow // 2
...@@ -159,8 +159,8 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq): ...@@ -159,8 +159,8 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
X = data[:, :-1] X = data[:, :-1]
Y = data[:, -1] Y = data[:, -1]
npy_bin_fname = tmpdir / 'data_from_npy.bin' npy_bin_fname = tmpdir / "data_from_npy.bin"
seq_bin_fname = tmpdir / 'data_from_seq.bin' seq_bin_fname = tmpdir / "data_from_seq.bin"
# Create dataset from numpy array directly. # Create dataset from numpy array directly.
ds = lgb.Dataset(X, label=Y, params=params) ds = lgb.Dataset(X, label=Y, params=params)
...@@ -181,9 +181,9 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq): ...@@ -181,9 +181,9 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
valid_X = valid_data[:, :-1] valid_X = valid_data[:, :-1]
valid_Y = valid_data[:, -1] valid_Y = valid_data[:, -1]
valid_npy_bin_fname = tmpdir / 'valid_data_from_npy.bin' valid_npy_bin_fname = tmpdir / "valid_data_from_npy.bin"
valid_seq_bin_fname = tmpdir / 'valid_data_from_seq.bin' valid_seq_bin_fname = tmpdir / "valid_data_from_seq.bin"
valid_seq2_bin_fname = tmpdir / 'valid_data_from_seq2.bin' valid_seq2_bin_fname = tmpdir / "valid_data_from_seq2.bin"
valid_ds = lgb.Dataset(valid_X, label=valid_Y, params=params, reference=ds) valid_ds = lgb.Dataset(valid_X, label=valid_Y, params=params, reference=ds)
valid_ds.save_binary(valid_npy_bin_fname) valid_ds.save_binary(valid_npy_bin_fname)
...@@ -200,7 +200,7 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq): ...@@ -200,7 +200,7 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
assert filecmp.cmp(valid_npy_bin_fname, valid_seq2_bin_fname) assert filecmp.cmp(valid_npy_bin_fname, valid_seq2_bin_fname)
@pytest.mark.parametrize('num_seq', [1, 2]) @pytest.mark.parametrize("num_seq", [1, 2])
def test_sequence_get_data(num_seq): def test_sequence_get_data(num_seq):
nrow = 20 nrow = 20
ncol = 11 ncol = 11
...@@ -218,12 +218,13 @@ def test_sequence_get_data(num_seq): ...@@ -218,12 +218,13 @@ def test_sequence_get_data(num_seq):
def test_chunked_dataset(): def test_chunked_dataset():
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, X_train, X_test, y_train, y_test = train_test_split(
random_state=2) *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
)
chunk_size = X_train.shape[0] // 10 + 1 chunk_size = X_train.shape[0] // 10 + 1
X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)] X_train = [X_train[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)] X_test = [X_test[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100}) train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100})
valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100}) valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100})
...@@ -232,12 +233,13 @@ def test_chunked_dataset(): ...@@ -232,12 +233,13 @@ def test_chunked_dataset():
def test_chunked_dataset_linear(): def test_chunked_dataset_linear():
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, X_train, X_test, y_train, y_test = train_test_split(
random_state=2) *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
)
chunk_size = X_train.shape[0] // 10 + 1 chunk_size = X_train.shape[0] // 10 + 1
X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)] X_train = [X_train[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)] X_test = [X_test[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
params = {"bin_construct_sample_cnt": 100, 'linear_tree': True} params = {"bin_construct_sample_cnt": 100, "linear_tree": True}
train_data = lgb.Dataset(X_train, label=y_train, params=params) train_data = lgb.Dataset(X_train, label=y_train, params=params)
valid_data = train_data.create_valid(X_test, label=y_test, params=params) valid_data = train_data.create_valid(X_test, label=y_test, params=params)
train_data.construct() train_data.construct()
...@@ -246,16 +248,16 @@ def test_chunked_dataset_linear(): ...@@ -246,16 +248,16 @@ def test_chunked_dataset_linear():
def test_save_dataset_subset_and_load_from_file(tmp_path): def test_save_dataset_subset_and_load_from_file(tmp_path):
data = np.random.rand(100, 2) data = np.random.rand(100, 2)
params = {'max_bin': 50, 'min_data_in_bin': 10} params = {"max_bin": 50, "min_data_in_bin": 10}
ds = lgb.Dataset(data, params=params) ds = lgb.Dataset(data, params=params)
ds.subset([1, 2, 3, 5, 8]).save_binary(tmp_path / 'subset.bin') ds.subset([1, 2, 3, 5, 8]).save_binary(tmp_path / "subset.bin")
lgb.Dataset(tmp_path / 'subset.bin', params=params).construct() lgb.Dataset(tmp_path / "subset.bin", params=params).construct()
def test_subset_group(): def test_subset_group():
rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank' rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank"
X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train')) X_train, y_train = load_svmlight_file(str(rank_example_dir / "rank.train"))
q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query')) q_train = np.loadtxt(str(rank_example_dir / "rank.train.query"))
lgb_train = lgb.Dataset(X_train, y_train, group=q_train) lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
assert len(lgb_train.get_group()) == 201 assert len(lgb_train.get_group()) == 201
subset = lgb_train.subset(list(range(10))).construct() subset = lgb_train.subset(list(range(10))).construct()
...@@ -294,7 +296,7 @@ def test_add_features_throws_if_datasets_unconstructed(): ...@@ -294,7 +296,7 @@ def test_add_features_throws_if_datasets_unconstructed():
def test_add_features_equal_data_on_alternating_used_unused(tmp_path): def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
X = np.random.random((100, 5)) X = np.random.random((100, 5))
X[:, [1, 3]] = 0 X[:, [1, 3]] = 0
names = [f'col_{i}' for i in range(5)] names = [f"col_{i}" for i in range(5)]
for j in range(1, 5): for j in range(1, 5):
d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct() d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct() d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
...@@ -304,9 +306,9 @@ def test_add_features_equal_data_on_alternating_used_unused(tmp_path): ...@@ -304,9 +306,9 @@ def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
d = lgb.Dataset(X, feature_name=names).construct() d = lgb.Dataset(X, feature_name=names).construct()
dname = tmp_path / "d.txt" dname = tmp_path / "d.txt"
d._dump_text(dname) d._dump_text(dname)
with open(d1name, 'rt') as d1f: with open(d1name, "rt") as d1f:
d1txt = d1f.read() d1txt = d1f.read()
with open(dname, 'rt') as df: with open(dname, "rt") as df:
dtxt = df.read() dtxt = df.read()
assert dtxt == d1txt assert dtxt == d1txt
...@@ -314,7 +316,7 @@ def test_add_features_equal_data_on_alternating_used_unused(tmp_path): ...@@ -314,7 +316,7 @@ def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
def test_add_features_same_booster_behaviour(tmp_path): def test_add_features_same_booster_behaviour(tmp_path):
X = np.random.random((100, 5)) X = np.random.random((100, 5))
X[:, [1, 3]] = 0 X[:, [1, 3]] = 0
names = [f'col_{i}' for i in range(5)] names = [f"col_{i}" for i in range(5)]
for j in range(1, 5): for j in range(1, 5):
d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct() d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct() d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
...@@ -332,9 +334,9 @@ def test_add_features_same_booster_behaviour(tmp_path): ...@@ -332,9 +334,9 @@ def test_add_features_same_booster_behaviour(tmp_path):
d1name = tmp_path / "d1.txt" d1name = tmp_path / "d1.txt"
b1.save_model(d1name) b1.save_model(d1name)
b.save_model(dname) b.save_model(dname)
with open(dname, 'rt') as df: with open(dname, "rt") as df:
dtxt = df.read() dtxt = df.read()
with open(d1name, 'rt') as d1f: with open(d1name, "rt") as d1f:
d1txt = d1f.read() d1txt = d1f.read()
assert dtxt == d1txt assert dtxt == d1txt
...@@ -345,11 +347,12 @@ def test_add_features_from_different_sources(): ...@@ -345,11 +347,12 @@ def test_add_features_from_different_sources():
n_col = 5 n_col = 5
X = np.random.random((n_row, n_col)) X = np.random.random((n_row, n_col))
xxs = [X, sparse.csr_matrix(X), pd.DataFrame(X)] xxs = [X, sparse.csr_matrix(X), pd.DataFrame(X)]
names = [f'col_{i}' for i in range(n_col)] names = [f"col_{i}" for i in range(n_col)]
seq = _create_sequence_from_ndarray(X, 1, 30) seq = _create_sequence_from_ndarray(X, 1, 30)
seq_ds = lgb.Dataset(seq, feature_name=names, free_raw_data=False).construct() seq_ds = lgb.Dataset(seq, feature_name=names, free_raw_data=False).construct()
npy_list_ds = lgb.Dataset([X[:n_row // 2, :], X[n_row // 2:, :]], npy_list_ds = lgb.Dataset(
feature_name=names, free_raw_data=False).construct() [X[: n_row // 2, :], X[n_row // 2 :, :]], feature_name=names, free_raw_data=False
).construct()
immergeable_dds = [seq_ds, npy_list_ds] immergeable_dds = [seq_ds, npy_list_ds]
for x_1 in xxs: for x_1 in xxs:
# test that method works even with free_raw_data=True # test that method works even with free_raw_data=True
...@@ -373,20 +376,19 @@ def test_add_features_from_different_sources(): ...@@ -373,20 +376,19 @@ def test_add_features_from_different_sources():
d1.add_features_from(d2) d1.add_features_from(d2)
assert isinstance(d1.get_data(), original_type) assert isinstance(d1.get_data(), original_type)
assert d1.get_data().shape == (n_row, n_col * idx) assert d1.get_data().shape == (n_row, n_col * idx)
res_feature_names += [f'D{idx}_{name}' for name in names] res_feature_names += [f"D{idx}_{name}" for name in names]
assert d1.feature_name == res_feature_names assert d1.feature_name == res_feature_names
def test_add_features_does_not_fail_if_initial_dataset_has_zero_informative_features(capsys): def test_add_features_does_not_fail_if_initial_dataset_has_zero_informative_features(capsys):
arr_a = np.zeros((100, 1), dtype=np.float32) arr_a = np.zeros((100, 1), dtype=np.float32)
arr_b = np.random.normal(size=(100, 5)) arr_b = np.random.normal(size=(100, 5))
dataset_a = lgb.Dataset(arr_a).construct() dataset_a = lgb.Dataset(arr_a).construct()
expected_msg = ( expected_msg = (
'[LightGBM] [Warning] There are no meaningful features which satisfy ' "[LightGBM] [Warning] There are no meaningful features which satisfy "
'the provided configuration. Decreasing Dataset parameters min_data_in_bin ' "the provided configuration. Decreasing Dataset parameters min_data_in_bin "
'or min_data_in_leaf and re-constructing Dataset might resolve this warning.\n' "or min_data_in_leaf and re-constructing Dataset might resolve this warning.\n"
) )
log_lines = capsys.readouterr().out log_lines = capsys.readouterr().out
assert expected_msg in log_lines assert expected_msg in log_lines
...@@ -404,7 +406,7 @@ def test_cegb_affects_behavior(tmp_path): ...@@ -404,7 +406,7 @@ def test_cegb_affects_behavior(tmp_path):
X = np.random.random((100, 5)) X = np.random.random((100, 5))
X[:, [1, 3]] = 0 X[:, [1, 3]] = 0
y = np.random.random(100) y = np.random.random(100)
names = [f'col_{i}' for i in range(5)] names = [f"col_{i}" for i in range(5)]
ds = lgb.Dataset(X, feature_name=names).construct() ds = lgb.Dataset(X, feature_name=names).construct()
ds.set_label(y) ds.set_label(y)
base = lgb.Booster(train_set=ds) base = lgb.Booster(train_set=ds)
...@@ -412,19 +414,21 @@ def test_cegb_affects_behavior(tmp_path): ...@@ -412,19 +414,21 @@ def test_cegb_affects_behavior(tmp_path):
base.update() base.update()
basename = tmp_path / "basename.txt" basename = tmp_path / "basename.txt"
base.save_model(basename) base.save_model(basename)
with open(basename, 'rt') as f: with open(basename, "rt") as f:
basetxt = f.read() basetxt = f.read()
# Set extremely harsh penalties, so CEGB will block most splits. # Set extremely harsh penalties, so CEGB will block most splits.
cases = [{'cegb_penalty_feature_coupled': [50, 100, 10, 25, 30]}, cases = [
{'cegb_penalty_feature_lazy': [1, 2, 3, 4, 5]}, {"cegb_penalty_feature_coupled": [50, 100, 10, 25, 30]},
{'cegb_penalty_split': 1}] {"cegb_penalty_feature_lazy": [1, 2, 3, 4, 5]},
{"cegb_penalty_split": 1},
]
for case in cases: for case in cases:
booster = lgb.Booster(train_set=ds, params=case) booster = lgb.Booster(train_set=ds, params=case)
for _ in range(10): for _ in range(10):
booster.update() booster.update()
casename = tmp_path / "casename.txt" casename = tmp_path / "casename.txt"
booster.save_model(casename) booster.save_model(casename)
with open(casename, 'rt') as f: with open(casename, "rt") as f:
casetxt = f.read() casetxt = f.read()
assert basetxt != casetxt assert basetxt != casetxt
...@@ -433,17 +437,22 @@ def test_cegb_scaling_equalities(tmp_path): ...@@ -433,17 +437,22 @@ def test_cegb_scaling_equalities(tmp_path):
X = np.random.random((100, 5)) X = np.random.random((100, 5))
X[:, [1, 3]] = 0 X[:, [1, 3]] = 0
y = np.random.random(100) y = np.random.random(100)
names = [f'col_{i}' for i in range(5)] names = [f"col_{i}" for i in range(5)]
ds = lgb.Dataset(X, feature_name=names).construct() ds = lgb.Dataset(X, feature_name=names).construct()
ds.set_label(y) ds.set_label(y)
# Compare pairs of penalties, to ensure scaling works as intended # Compare pairs of penalties, to ensure scaling works as intended
pairs = [({'cegb_penalty_feature_coupled': [1, 2, 1, 2, 1]}, pairs = [
{'cegb_penalty_feature_coupled': [0.5, 1, 0.5, 1, 0.5], 'cegb_tradeoff': 2}), (
({'cegb_penalty_feature_lazy': [0.01, 0.02, 0.03, 0.04, 0.05]}, {"cegb_penalty_feature_coupled": [1, 2, 1, 2, 1]},
{'cegb_penalty_feature_lazy': [0.005, 0.01, 0.015, 0.02, 0.025], 'cegb_tradeoff': 2}), {"cegb_penalty_feature_coupled": [0.5, 1, 0.5, 1, 0.5], "cegb_tradeoff": 2},
({'cegb_penalty_split': 1}, ),
{'cegb_penalty_split': 2, 'cegb_tradeoff': 0.5})] (
for (p1, p2) in pairs: {"cegb_penalty_feature_lazy": [0.01, 0.02, 0.03, 0.04, 0.05]},
{"cegb_penalty_feature_lazy": [0.005, 0.01, 0.015, 0.02, 0.025], "cegb_tradeoff": 2},
),
({"cegb_penalty_split": 1}, {"cegb_penalty_split": 2, "cegb_tradeoff": 0.5}),
]
for p1, p2 in pairs:
booster1 = lgb.Booster(train_set=ds, params=p1) booster1 = lgb.Booster(train_set=ds, params=p1)
booster2 = lgb.Booster(train_set=ds, params=p2) booster2 = lgb.Booster(train_set=ds, params=p2)
for _ in range(10): for _ in range(10):
...@@ -453,32 +462,30 @@ def test_cegb_scaling_equalities(tmp_path): ...@@ -453,32 +462,30 @@ def test_cegb_scaling_equalities(tmp_path):
# Reset booster1's parameters to p2, so the parameter section of the file matches. # Reset booster1's parameters to p2, so the parameter section of the file matches.
booster1.reset_parameter(p2) booster1.reset_parameter(p2)
booster1.save_model(p1name) booster1.save_model(p1name)
with open(p1name, 'rt') as f: with open(p1name, "rt") as f:
p1txt = f.read() p1txt = f.read()
p2name = tmp_path / "p2.txt" p2name = tmp_path / "p2.txt"
booster2.save_model(p2name) booster2.save_model(p2name)
with open(p2name, 'rt') as f: with open(p2name, "rt") as f:
p2txt = f.read() p2txt = f.read()
assert p1txt == p2txt assert p1txt == p2txt
def test_consistent_state_for_dataset_fields(): def test_consistent_state_for_dataset_fields():
def check_asserts(data): def check_asserts(data):
np.testing.assert_allclose(data.label, data.get_label()) np.testing.assert_allclose(data.label, data.get_label())
np.testing.assert_allclose(data.label, data.get_field('label')) np.testing.assert_allclose(data.label, data.get_field("label"))
assert not np.isnan(data.label[0]) assert not np.isnan(data.label[0])
assert not np.isinf(data.label[1]) assert not np.isinf(data.label[1])
np.testing.assert_allclose(data.weight, data.get_weight()) np.testing.assert_allclose(data.weight, data.get_weight())
np.testing.assert_allclose(data.weight, data.get_field('weight')) np.testing.assert_allclose(data.weight, data.get_field("weight"))
assert not np.isnan(data.weight[0]) assert not np.isnan(data.weight[0])
assert not np.isinf(data.weight[1]) assert not np.isinf(data.weight[1])
np.testing.assert_allclose(data.init_score, data.get_init_score()) np.testing.assert_allclose(data.init_score, data.get_init_score())
np.testing.assert_allclose(data.init_score, data.get_field('init_score')) np.testing.assert_allclose(data.init_score, data.get_field("init_score"))
assert not np.isnan(data.init_score[0]) assert not np.isnan(data.init_score[0])
assert not np.isinf(data.init_score[1]) assert not np.isinf(data.init_score[1])
assert np.all(np.isclose([data.label[0], data.weight[0], data.init_score[0]], assert np.all(np.isclose([data.label[0], data.weight[0], data.init_score[0]], data.label[0]))
data.label[0]))
assert data.label[1] == pytest.approx(data.weight[1]) assert data.label[1] == pytest.approx(data.weight[1])
assert data.feature_name == data.get_feature_name() assert data.feature_name == data.get_feature_name()
...@@ -486,10 +493,8 @@ def test_consistent_state_for_dataset_fields(): ...@@ -486,10 +493,8 @@ def test_consistent_state_for_dataset_fields():
sequence = np.ones(y.shape[0]) sequence = np.ones(y.shape[0])
sequence[0] = np.nan sequence[0] = np.nan
sequence[1] = np.inf sequence[1] = np.inf
feature_names = [f'f{i}'for i in range(X.shape[1])] feature_names = [f"f{i}" for i in range(X.shape[1])]
lgb_data = lgb.Dataset(X, sequence, lgb_data = lgb.Dataset(X, sequence, weight=sequence, init_score=sequence, feature_name=feature_names).construct()
weight=sequence, init_score=sequence,
feature_name=feature_names).construct()
check_asserts(lgb_data) check_asserts(lgb_data)
lgb_data = lgb.Dataset(X, y).construct() lgb_data = lgb.Dataset(X, y).construct()
lgb_data.set_label(sequence) lgb_data.set_label(sequence)
...@@ -500,20 +505,15 @@ def test_consistent_state_for_dataset_fields(): ...@@ -500,20 +505,15 @@ def test_consistent_state_for_dataset_fields():
def test_dataset_construction_overwrites_user_provided_metadata_fields(): def test_dataset_construction_overwrites_user_provided_metadata_fields():
X = np.array([[1.0, 2.0], [3.0, 4.0]]) X = np.array([[1.0, 2.0], [3.0, 4.0]])
position = np.array([0.0, 1.0], dtype=np.float32) position = np.array([0.0, 1.0], dtype=np.float32)
if getenv('TASK', '') == 'cuda': if getenv("TASK", "") == "cuda":
position = None position = None
dtrain = lgb.Dataset( dtrain = lgb.Dataset(
X, X,
params={ params={"min_data_in_bin": 1, "min_data_in_leaf": 1, "verbosity": -1},
"min_data_in_bin": 1,
"min_data_in_leaf": 1,
"verbosity": -1
},
group=[1, 1], group=[1, 1],
init_score=[0.312, 0.708], init_score=[0.312, 0.708],
label=[1, 2], label=[1, 2],
...@@ -528,17 +528,9 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields(): ...@@ -528,17 +528,9 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
assert dtrain.get_init_score() == [0.312, 0.708] assert dtrain.get_init_score() == [0.312, 0.708]
assert dtrain.label == [1, 2] assert dtrain.label == [1, 2]
assert dtrain.get_label() == [1, 2] assert dtrain.get_label() == [1, 2]
if getenv('TASK', '') != 'cuda': if getenv("TASK", "") != "cuda":
np_assert_array_equal( np_assert_array_equal(dtrain.position, np.array([0.0, 1.0], dtype=np.float32), strict=True)
dtrain.position, np_assert_array_equal(dtrain.get_position(), np.array([0.0, 1.0], dtype=np.float32), strict=True)
np.array([0.0, 1.0], dtype=np.float32),
strict=True
)
np_assert_array_equal(
dtrain.get_position(),
np.array([0.0, 1.0], dtype=np.float32),
strict=True
)
assert dtrain.weight == [0.5, 1.5] assert dtrain.weight == [0.5, 1.5]
assert dtrain.get_weight() == [0.5, 1.5] assert dtrain.get_weight() == [0.5, 1.5]
...@@ -554,13 +546,11 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields(): ...@@ -554,13 +546,11 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
np_assert_array_equal(dtrain.group, expected_group, strict=True) np_assert_array_equal(dtrain.group, expected_group, strict=True)
np_assert_array_equal(dtrain.get_group(), expected_group, strict=True) np_assert_array_equal(dtrain.get_group(), expected_group, strict=True)
# get_field("group") returns a numpy array with boundaries, instead of size # get_field("group") returns a numpy array with boundaries, instead of size
np_assert_array_equal( np_assert_array_equal(dtrain.get_field("group"), np.array([0, 1, 2], dtype=np.int32), strict=True)
dtrain.get_field("group"),
np.array([0, 1, 2], dtype=np.int32),
strict=True
)
expected_init_score = np.array([0.312, 0.708],) expected_init_score = np.array(
[0.312, 0.708],
)
np_assert_array_equal(dtrain.init_score, expected_init_score, strict=True) np_assert_array_equal(dtrain.init_score, expected_init_score, strict=True)
np_assert_array_equal(dtrain.get_init_score(), expected_init_score, strict=True) np_assert_array_equal(dtrain.get_init_score(), expected_init_score, strict=True)
np_assert_array_equal(dtrain.get_field("init_score"), expected_init_score, strict=True) np_assert_array_equal(dtrain.get_field("init_score"), expected_init_score, strict=True)
...@@ -570,16 +560,12 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields(): ...@@ -570,16 +560,12 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
np_assert_array_equal(dtrain.get_label(), expected_label, strict=True) np_assert_array_equal(dtrain.get_label(), expected_label, strict=True)
np_assert_array_equal(dtrain.get_field("label"), expected_label, strict=True) np_assert_array_equal(dtrain.get_field("label"), expected_label, strict=True)
if getenv('TASK', '') != 'cuda': if getenv("TASK", "") != "cuda":
expected_position = np.array([0.0, 1.0], dtype=np.float32) expected_position = np.array([0.0, 1.0], dtype=np.float32)
np_assert_array_equal(dtrain.position, expected_position, strict=True) np_assert_array_equal(dtrain.position, expected_position, strict=True)
np_assert_array_equal(dtrain.get_position(), expected_position, strict=True) np_assert_array_equal(dtrain.get_position(), expected_position, strict=True)
# NOTE: "position" is converted to int32 on the C++ side # NOTE: "position" is converted to int32 on the C++ side
np_assert_array_equal( np_assert_array_equal(dtrain.get_field("position"), np.array([0.0, 1.0], dtype=np.int32), strict=True)
dtrain.get_field("position"),
np.array([0.0, 1.0], dtype=np.int32),
strict=True
)
expected_weight = np.array([0.5, 1.5], dtype=np.float32) expected_weight = np.array([0.5, 1.5], dtype=np.float32)
np_assert_array_equal(dtrain.weight, expected_weight, strict=True) np_assert_array_equal(dtrain.weight, expected_weight, strict=True)
...@@ -588,7 +574,6 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields(): ...@@ -588,7 +574,6 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
def test_choose_param_value(): def test_choose_param_value():
original_params = { original_params = {
"local_listen_port": 1234, "local_listen_port": 1234,
"port": 2222, "port": 2222,
...@@ -599,30 +584,20 @@ def test_choose_param_value(): ...@@ -599,30 +584,20 @@ def test_choose_param_value():
# should resolve duplicate aliases, and prefer the main parameter # should resolve duplicate aliases, and prefer the main parameter
params = lgb.basic._choose_param_value( params = lgb.basic._choose_param_value(
main_param_name="local_listen_port", main_param_name="local_listen_port", params=original_params, default_value=5555
params=original_params,
default_value=5555
) )
assert params["local_listen_port"] == 1234 assert params["local_listen_port"] == 1234
assert "port" not in params assert "port" not in params
# should choose the highest priority alias and set that value on main param # should choose the highest priority alias and set that value on main param
# if only aliases are used # if only aliases are used
params = lgb.basic._choose_param_value( params = lgb.basic._choose_param_value(main_param_name="num_iterations", params=params, default_value=17)
main_param_name="num_iterations",
params=params,
default_value=17
)
assert params["num_iterations"] == 13 assert params["num_iterations"] == 13
assert "num_trees" not in params assert "num_trees" not in params
assert "n_iter" not in params assert "n_iter" not in params
# should use the default if main param and aliases are missing # should use the default if main param and aliases are missing
params = lgb.basic._choose_param_value( params = lgb.basic._choose_param_value(main_param_name="learning_rate", params=params, default_value=0.789)
main_param_name="learning_rate",
params=params,
default_value=0.789
)
assert params["learning_rate"] == 0.789 assert params["learning_rate"] == 0.789
# all changes should be made on copies and not modify the original # all changes should be made on copies and not modify the original
...@@ -637,37 +612,23 @@ def test_choose_param_value(): ...@@ -637,37 +612,23 @@ def test_choose_param_value():
def test_choose_param_value_preserves_nones(): def test_choose_param_value_preserves_nones():
# preserves None found for main param and still removes aliases # preserves None found for main param and still removes aliases
params = lgb.basic._choose_param_value( params = lgb.basic._choose_param_value(
main_param_name="num_threads", main_param_name="num_threads",
params={ params={"num_threads": None, "n_jobs": 4, "objective": "regression"},
"num_threads": None, default_value=2,
"n_jobs": 4,
"objective": "regression"
},
default_value=2
) )
assert params == {"num_threads": None, "objective": "regression"} assert params == {"num_threads": None, "objective": "regression"}
# correctly chooses value when only an alias is provided # correctly chooses value when only an alias is provided
params = lgb.basic._choose_param_value( params = lgb.basic._choose_param_value(
main_param_name="num_threads", main_param_name="num_threads", params={"n_jobs": None, "objective": "regression"}, default_value=2
params={
"n_jobs": None,
"objective": "regression"
},
default_value=2
) )
assert params == {"num_threads": None, "objective": "regression"} assert params == {"num_threads": None, "objective": "regression"}
# adds None if that's given as the default and param not found # adds None if that's given as the default and param not found
params = lgb.basic._choose_param_value( params = lgb.basic._choose_param_value(
main_param_name="min_data_in_leaf", main_param_name="min_data_in_leaf", params={"objective": "regression"}, default_value=None
params={
"objective": "regression"
},
default_value=None
) )
assert params == {"objective": "regression", "min_data_in_leaf": None} assert params == {"objective": "regression", "min_data_in_leaf": None}
...@@ -676,51 +637,39 @@ def test_choose_param_value_preserves_nones(): ...@@ -676,51 +637,39 @@ def test_choose_param_value_preserves_nones():
def test_choose_param_value_objective(objective_alias): def test_choose_param_value_objective(objective_alias):
# If callable is found in objective # If callable is found in objective
params = {objective_alias: dummy_obj} params = {objective_alias: dummy_obj}
params = lgb.basic._choose_param_value( params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=None)
main_param_name="objective", assert params["objective"] == dummy_obj
params=params,
default_value=None
)
assert params['objective'] == dummy_obj
# Value in params should be preferred to the default_value passed from keyword arguments # Value in params should be preferred to the default_value passed from keyword arguments
params = {objective_alias: dummy_obj} params = {objective_alias: dummy_obj}
params = lgb.basic._choose_param_value( params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=mse_obj)
main_param_name="objective", assert params["objective"] == dummy_obj
params=params,
default_value=mse_obj
)
assert params['objective'] == dummy_obj
# None of objective or its aliases in params, but default_value is callable. # None of objective or its aliases in params, but default_value is callable.
params = {} params = {}
params = lgb.basic._choose_param_value( params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=mse_obj)
main_param_name="objective", assert params["objective"] == mse_obj
params=params,
default_value=mse_obj
)
assert params['objective'] == mse_obj
@pytest.mark.parametrize('collection', ['1d_np', '2d_np', 'pd_float', 'pd_str', '1d_list', '2d_list']) @pytest.mark.parametrize("collection", ["1d_np", "2d_np", "pd_float", "pd_str", "1d_list", "2d_list"])
@pytest.mark.parametrize('dtype', [np.float32, np.float64]) @pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_list_to_1d_numpy(collection, dtype): def test_list_to_1d_numpy(collection, dtype):
collection2y = { collection2y = {
'1d_np': np.random.rand(10), "1d_np": np.random.rand(10),
'2d_np': np.random.rand(10, 1), "2d_np": np.random.rand(10, 1),
'pd_float': np.random.rand(10), "pd_float": np.random.rand(10),
'pd_str': ['a', 'b'], "pd_str": ["a", "b"],
'1d_list': [1] * 10, "1d_list": [1] * 10,
'2d_list': [[1], [2]], "2d_list": [[1], [2]],
} }
y = collection2y[collection] y = collection2y[collection]
if collection.startswith('pd'): if collection.startswith("pd"):
if not PANDAS_INSTALLED: if not PANDAS_INSTALLED:
pytest.skip('pandas is not installed') pytest.skip("pandas is not installed")
else: else:
y = pd_Series(y) y = pd_Series(y)
if isinstance(y, np.ndarray) and len(y.shape) == 2: if isinstance(y, np.ndarray) and len(y.shape) == 2:
with pytest.warns(UserWarning, match='column-vector'): with pytest.warns(UserWarning, match="column-vector"):
lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name="list") lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name="list")
return return
elif isinstance(y, list) and isinstance(y[0], list): elif isinstance(y, list) and isinstance(y[0], list):
...@@ -736,30 +685,31 @@ def test_list_to_1d_numpy(collection, dtype): ...@@ -736,30 +685,31 @@ def test_list_to_1d_numpy(collection, dtype):
assert result.dtype == dtype assert result.dtype == dtype
@pytest.mark.parametrize('init_score_type', ['array', 'dataframe', 'list']) @pytest.mark.parametrize("init_score_type", ["array", "dataframe", "list"])
def test_init_score_for_multiclass_classification(init_score_type): def test_init_score_for_multiclass_classification(init_score_type):
init_score = [[i * 10 + j for j in range(3)] for i in range(10)] init_score = [[i * 10 + j for j in range(3)] for i in range(10)]
if init_score_type == 'array': if init_score_type == "array":
init_score = np.array(init_score) init_score = np.array(init_score)
elif init_score_type == 'dataframe': elif init_score_type == "dataframe":
if not PANDAS_INSTALLED: if not PANDAS_INSTALLED:
pytest.skip('Pandas is not installed.') pytest.skip("Pandas is not installed.")
init_score = pd_DataFrame(init_score) init_score = pd_DataFrame(init_score)
data = np.random.rand(10, 2) data = np.random.rand(10, 2)
ds = lgb.Dataset(data, init_score=init_score).construct() ds = lgb.Dataset(data, init_score=init_score).construct()
np.testing.assert_equal(ds.get_field('init_score'), init_score) np.testing.assert_equal(ds.get_field("init_score"), init_score)
np.testing.assert_equal(ds.init_score, init_score) np.testing.assert_equal(ds.init_score, init_score)
def test_smoke_custom_parser(tmp_path): def test_smoke_custom_parser(tmp_path):
data_path = Path(__file__).absolute().parents[2] / 'examples' / 'binary_classification' / 'binary.train' data_path = Path(__file__).absolute().parents[2] / "examples" / "binary_classification" / "binary.train"
parser_config_file = tmp_path / 'parser.ini' parser_config_file = tmp_path / "parser.ini"
with open(parser_config_file, 'w') as fout: with open(parser_config_file, "w") as fout:
fout.write('{"className": "dummy", "id": "1"}') fout.write('{"className": "dummy", "id": "1"}')
data = lgb.Dataset(data_path, params={"parser_config_file": parser_config_file}) data = lgb.Dataset(data_path, params={"parser_config_file": parser_config_file})
with pytest.raises(lgb.basic.LightGBMError, with pytest.raises(
match="Cannot find parser class 'dummy', please register first or check config format"): lgb.basic.LightGBMError, match="Cannot find parser class 'dummy', please register first or check config format"
):
data.construct() data.construct()
...@@ -770,9 +720,13 @@ def test_param_aliases(): ...@@ -770,9 +720,13 @@ def test_param_aliases():
assert all(isinstance(i, list) for i in aliases.values()) assert all(isinstance(i, list) for i in aliases.values())
assert all(len(i) >= 1 for i in aliases.values()) assert all(len(i) >= 1 for i in aliases.values())
assert all(k in v for k, v in aliases.items()) assert all(k in v for k, v in aliases.items())
assert lgb.basic._ConfigAliases.get('config', 'task') == {'config', 'config_file', 'task', 'task_type'} assert lgb.basic._ConfigAliases.get("config", "task") == {"config", "config_file", "task", "task_type"}
assert lgb.basic._ConfigAliases.get_sorted('min_data_in_leaf') == [ assert lgb.basic._ConfigAliases.get_sorted("min_data_in_leaf") == [
'min_data_in_leaf', 'min_data', 'min_samples_leaf', 'min_child_samples', 'min_data_per_leaf' "min_data_in_leaf",
"min_data",
"min_samples_leaf",
"min_child_samples",
"min_data_per_leaf",
] ]
...@@ -793,10 +747,10 @@ def test_custom_objective_safety(): ...@@ -793,10 +747,10 @@ def test_custom_objective_safety():
y_multiclass = np.arange(nrows) % nclass y_multiclass = np.arange(nrows) % nclass
ds_binary = lgb.Dataset(X, y_binary).construct() ds_binary = lgb.Dataset(X, y_binary).construct()
ds_multiclass = lgb.Dataset(X, y_multiclass).construct() ds_multiclass = lgb.Dataset(X, y_multiclass).construct()
bad_bst_binary = lgb.Booster({'objective': "none"}, ds_binary) bad_bst_binary = lgb.Booster({"objective": "none"}, ds_binary)
good_bst_binary = lgb.Booster({'objective': "none"}, ds_binary) good_bst_binary = lgb.Booster({"objective": "none"}, ds_binary)
bad_bst_multi = lgb.Booster({'objective': "none", "num_class": nclass}, ds_multiclass) bad_bst_multi = lgb.Booster({"objective": "none", "num_class": nclass}, ds_multiclass)
good_bst_multi = lgb.Booster({'objective': "none", "num_class": nclass}, ds_multiclass) good_bst_multi = lgb.Booster({"objective": "none", "num_class": nclass}, ds_multiclass)
good_bst_binary.update(fobj=_good_gradients) good_bst_binary.update(fobj=_good_gradients)
with pytest.raises(ValueError, match=re.escape("number of models per one iteration (1)")): with pytest.raises(ValueError, match=re.escape("number of models per one iteration (1)")):
bad_bst_binary.update(fobj=_bad_gradients) bad_bst_binary.update(fobj=_bad_gradients)
...@@ -805,33 +759,30 @@ def test_custom_objective_safety(): ...@@ -805,33 +759,30 @@ def test_custom_objective_safety():
bad_bst_multi.update(fobj=_bad_gradients) bad_bst_multi.update(fobj=_bad_gradients)
@pytest.mark.parametrize('dtype', [np.float32, np.float64]) @pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize('feature_name', [['x1', 'x2'], 'auto']) @pytest.mark.parametrize("feature_name", [["x1", "x2"], "auto"])
def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name): def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name):
pd = pytest.importorskip('pandas') pd = pytest.importorskip("pandas")
X = np.random.rand(10, 2).astype(dtype) X = np.random.rand(10, 2).astype(dtype)
df = pd.DataFrame(X) df = pd.DataFrame(X)
built_data = lgb.basic._data_from_pandas( built_data = lgb.basic._data_from_pandas(
data=df, data=df, feature_name=feature_name, categorical_feature="auto", pandas_categorical=None
feature_name=feature_name,
categorical_feature="auto",
pandas_categorical=None
)[0] )[0]
assert built_data.dtype == dtype assert built_data.dtype == dtype
assert np.shares_memory(X, built_data) assert np.shares_memory(X, built_data)
@pytest.mark.parametrize('feature_name', [['x1'], [42], 'auto']) @pytest.mark.parametrize("feature_name", [["x1"], [42], "auto"])
@pytest.mark.parametrize('categories', ['seen', 'unseen']) @pytest.mark.parametrize("categories", ["seen", "unseen"])
def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories): def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories):
pd = pytest.importorskip('pandas') pd = pytest.importorskip("pandas")
X = np.random.choice(['a', 'b'], 100).reshape(-1, 1) X = np.random.choice(["a", "b"], 100).reshape(-1, 1)
column_name = 'a' if feature_name == 'auto' else feature_name[0] column_name = "a" if feature_name == "auto" else feature_name[0]
df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category') df = pd.DataFrame(X.copy(), columns=[column_name], dtype="category")
if categories == 'seen': if categories == "seen":
pandas_categorical = [['a', 'b']] pandas_categorical = [["a", "b"]]
else: else:
pandas_categorical = [['a']] pandas_categorical = [["a"]]
data = lgb.basic._data_from_pandas( data = lgb.basic._data_from_pandas(
data=df, data=df,
feature_name=feature_name, feature_name=feature_name,
...@@ -841,31 +792,33 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, c ...@@ -841,31 +792,33 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, c
# check that the original data wasn't modified # check that the original data wasn't modified
np.testing.assert_equal(df[column_name], X[:, 0]) np.testing.assert_equal(df[column_name], X[:, 0])
# check that the built data has the codes # check that the built data has the codes
if categories == 'seen': if categories == "seen":
# if all categories were seen during training we just take the codes # if all categories were seen during training we just take the codes
codes = df[column_name].cat.codes codes = df[column_name].cat.codes
else: else:
# if we only saw 'a' during training we just replace its code # if we only saw 'a' during training we just replace its code
# and leave the rest as nan # and leave the rest as nan
a_code = df[column_name].cat.categories.get_loc('a') a_code = df[column_name].cat.categories.get_loc("a")
codes = np.where(df[column_name] == 'a', a_code, np.nan) codes = np.where(df[column_name] == "a", a_code, np.nan)
np.testing.assert_equal(codes, data[:, 0]) np.testing.assert_equal(codes, data[:, 0])
@pytest.mark.parametrize('min_data_in_bin', [2, 10]) @pytest.mark.parametrize("min_data_in_bin", [2, 10])
def test_feature_num_bin(min_data_in_bin): def test_feature_num_bin(min_data_in_bin):
X = np.vstack([ X = np.vstack(
np.random.rand(100), [
np.array([1, 2] * 50), np.random.rand(100),
np.array([0, 1, 2] * 33 + [0]), np.array([1, 2] * 50),
np.array([1, 2] * 49 + 2 * [np.nan]), np.array([0, 1, 2] * 33 + [0]),
np.zeros(100), np.array([1, 2] * 49 + 2 * [np.nan]),
np.random.choice([0, 1], 100), np.zeros(100),
]).T np.random.choice([0, 1], 100),
]
).T
n_continuous = X.shape[1] - 1 n_continuous = X.shape[1] - 1
feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1'] feature_name = [f"x{i}" for i in range(n_continuous)] + ["cat1"]
ds_kwargs = { ds_kwargs = {
"params": {'min_data_in_bin': min_data_in_bin}, "params": {"min_data_in_bin": min_data_in_bin},
"categorical_feature": [n_continuous], # last feature "categorical_feature": [n_continuous], # last feature
} }
ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct() ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct()
...@@ -884,7 +837,7 @@ def test_feature_num_bin(min_data_in_bin): ...@@ -884,7 +837,7 @@ def test_feature_num_bin(min_data_in_bin):
assert bins_by_name == expected_num_bins assert bins_by_name == expected_num_bins
# test using default feature names # test using default feature names
ds_no_names = lgb.Dataset(X, **ds_kwargs).construct() ds_no_names = lgb.Dataset(X, **ds_kwargs).construct()
default_names = [f'Column_{i}' for i in range(X.shape[1])] default_names = [f"Column_{i}" for i in range(X.shape[1])]
bins_by_default_name = [ds_no_names.feature_num_bin(name) for name in default_names] bins_by_default_name = [ds_no_names.feature_num_bin(name) for name in default_names]
assert bins_by_default_name == expected_num_bins assert bins_by_default_name == expected_num_bins
# check for feature indices outside of range # check for feature indices outside of range
...@@ -892,9 +845,9 @@ def test_feature_num_bin(min_data_in_bin): ...@@ -892,9 +845,9 @@ def test_feature_num_bin(min_data_in_bin):
with pytest.raises( with pytest.raises(
lgb.basic.LightGBMError, lgb.basic.LightGBMError,
match=( match=(
f'Tried to retrieve number of bins for feature index {num_features}, ' f"Tried to retrieve number of bins for feature index {num_features}, "
f'but the valid feature indices are \\[0, {num_features - 1}\\].' f"but the valid feature indices are \\[0, {num_features - 1}\\]."
) ),
): ):
ds.feature_num_bin(num_features) ds.feature_num_bin(num_features)
...@@ -902,7 +855,7 @@ def test_feature_num_bin(min_data_in_bin): ...@@ -902,7 +855,7 @@ def test_feature_num_bin(min_data_in_bin):
def test_feature_num_bin_with_max_bin_by_feature(): def test_feature_num_bin_with_max_bin_by_feature():
X = np.random.rand(100, 3) X = np.random.rand(100, 3)
max_bin_by_feature = np.random.randint(3, 30, size=X.shape[1]) max_bin_by_feature = np.random.randint(3, 30, size=X.shape[1])
ds = lgb.Dataset(X, params={'max_bin_by_feature': max_bin_by_feature}).construct() ds = lgb.Dataset(X, params={"max_bin_by_feature": max_bin_by_feature}).construct()
actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])] actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])]
np.testing.assert_equal(actual_num_bins, max_bin_by_feature) np.testing.assert_equal(actual_num_bins, max_bin_by_feature)
...@@ -910,7 +863,7 @@ def test_feature_num_bin_with_max_bin_by_feature(): ...@@ -910,7 +863,7 @@ def test_feature_num_bin_with_max_bin_by_feature():
def test_set_leaf_output(): def test_set_leaf_output():
X, y = load_breast_cancer(return_X_y=True) X, y = load_breast_cancer(return_X_y=True)
ds = lgb.Dataset(X, y) ds = lgb.Dataset(X, y)
bst = lgb.Booster({'num_leaves': 2}, ds) bst = lgb.Booster({"num_leaves": 2}, ds)
bst.update() bst.update()
y_pred = bst.predict(X) y_pred = bst.predict(X)
for leaf_id in range(2): for leaf_id in range(2):
......
...@@ -10,7 +10,7 @@ def reset_feature_fraction(boosting_round): ...@@ -10,7 +10,7 @@ def reset_feature_fraction(boosting_round):
return 0.6 if boosting_round < 15 else 0.8 return 0.6 if boosting_round < 15 else 0.8
@pytest.mark.parametrize('serializer', SERIALIZERS) @pytest.mark.parametrize("serializer", SERIALIZERS)
def test_early_stopping_callback_is_picklable(serializer): def test_early_stopping_callback_is_picklable(serializer):
rounds = 5 rounds = 5
callback = lgb.early_stopping(stopping_rounds=rounds) callback = lgb.early_stopping(stopping_rounds=rounds)
...@@ -32,7 +32,7 @@ def test_early_stopping_callback_rejects_invalid_stopping_rounds_with_informativ ...@@ -32,7 +32,7 @@ def test_early_stopping_callback_rejects_invalid_stopping_rounds_with_informativ
lgb.early_stopping(stopping_rounds="neverrrr") lgb.early_stopping(stopping_rounds="neverrrr")
@pytest.mark.parametrize('serializer', SERIALIZERS) @pytest.mark.parametrize("serializer", SERIALIZERS)
def test_log_evaluation_callback_is_picklable(serializer): def test_log_evaluation_callback_is_picklable(serializer):
periods = 42 periods = 42
callback = lgb.log_evaluation(period=periods) callback = lgb.log_evaluation(period=periods)
...@@ -43,7 +43,7 @@ def test_log_evaluation_callback_is_picklable(serializer): ...@@ -43,7 +43,7 @@ def test_log_evaluation_callback_is_picklable(serializer):
assert callback.period == periods assert callback.period == periods
@pytest.mark.parametrize('serializer', SERIALIZERS) @pytest.mark.parametrize("serializer", SERIALIZERS)
def test_record_evaluation_callback_is_picklable(serializer): def test_record_evaluation_callback_is_picklable(serializer):
results = {} results = {}
callback = lgb.record_evaluation(eval_result=results) callback = lgb.record_evaluation(eval_result=results)
...@@ -54,12 +54,9 @@ def test_record_evaluation_callback_is_picklable(serializer): ...@@ -54,12 +54,9 @@ def test_record_evaluation_callback_is_picklable(serializer):
assert callback.eval_result is results assert callback.eval_result is results
@pytest.mark.parametrize('serializer', SERIALIZERS) @pytest.mark.parametrize("serializer", SERIALIZERS)
def test_reset_parameter_callback_is_picklable(serializer): def test_reset_parameter_callback_is_picklable(serializer):
params = { params = {"bagging_fraction": [0.7] * 5 + [0.6] * 5, "feature_fraction": reset_feature_fraction}
'bagging_fraction': [0.7] * 5 + [0.6] * 5,
'feature_fraction': reset_feature_fraction
}
callback = lgb.reset_parameter(**params) callback = lgb.reset_parameter(**params)
callback_from_disk = pickle_and_unpickle_object(obj=callback, serializer=serializer) callback_from_disk = pickle_and_unpickle_object(obj=callback, serializer=serializer)
assert callback_from_disk.order == 10 assert callback_from_disk.order == 10
......
...@@ -6,22 +6,21 @@ from sklearn.datasets import load_svmlight_file ...@@ -6,22 +6,21 @@ from sklearn.datasets import load_svmlight_file
import lightgbm as lgb import lightgbm as lgb
EXAMPLES_DIR = Path(__file__).absolute().parents[2] / 'examples' EXAMPLES_DIR = Path(__file__).absolute().parents[2] / "examples"
class FileLoader: class FileLoader:
def __init__(self, directory, prefix, config_file="train.conf"):
def __init__(self, directory, prefix, config_file='train.conf'):
self.directory = directory self.directory = directory
self.prefix = prefix self.prefix = prefix
self.params = {'gpu_use_dp': True} self.params = {"gpu_use_dp": True}
with open(self.directory / config_file, 'r') as f: with open(self.directory / config_file, "r") as f:
for line in f.readlines(): for line in f.readlines():
line = line.strip() line = line.strip()
if line and not line.startswith('#'): if line and not line.startswith("#"):
key, value = [token.strip() for token in line.split('=')] key, value = [token.strip() for token in line.split("=")]
if 'early_stopping' not in key: # disable early_stopping if "early_stopping" not in key: # disable early_stopping
self.params[key] = value if key not in {'num_trees', 'num_threads'} else int(value) self.params[key] = value if key not in {"num_trees", "num_threads"} else int(value)
def load_dataset(self, suffix, is_sparse=False): def load_dataset(self, suffix, is_sparse=False):
filename = str(self.path(suffix)) filename = str(self.path(suffix))
...@@ -33,14 +32,14 @@ class FileLoader: ...@@ -33,14 +32,14 @@ class FileLoader:
return mat[:, 1:], mat[:, 0], filename return mat[:, 1:], mat[:, 0], filename
def load_field(self, suffix): def load_field(self, suffix):
return np.loadtxt(str(self.directory / f'{self.prefix}{suffix}')) return np.loadtxt(str(self.directory / f"{self.prefix}{suffix}"))
def load_cpp_result(self, result_file='LightGBM_predict_result.txt'): def load_cpp_result(self, result_file="LightGBM_predict_result.txt"):
return np.loadtxt(str(self.directory / result_file)) return np.loadtxt(str(self.directory / result_file))
def train_predict_check(self, lgb_train, X_test, X_test_fn, sk_pred): def train_predict_check(self, lgb_train, X_test, X_test_fn, sk_pred):
params = dict(self.params) params = dict(self.params)
params['force_row_wise'] = True params["force_row_wise"] = True
gbm = lgb.train(params, lgb_train) gbm = lgb.train(params, lgb_train)
y_pred = gbm.predict(X_test) y_pred = gbm.predict(X_test)
cpp_pred = gbm.predict(X_test_fn) cpp_pred = gbm.predict(X_test_fn)
...@@ -49,7 +48,7 @@ class FileLoader: ...@@ -49,7 +48,7 @@ class FileLoader:
def file_load_check(self, lgb_train, name): def file_load_check(self, lgb_train, name):
lgb_train_f = lgb.Dataset(self.path(name), params=self.params).construct() lgb_train_f = lgb.Dataset(self.path(name), params=self.params).construct()
for f in ('num_data', 'num_feature', 'get_label', 'get_weight', 'get_init_score', 'get_group'): for f in ("num_data", "num_feature", "get_label", "get_weight", "get_init_score", "get_group"):
a = getattr(lgb_train, f)() a = getattr(lgb_train, f)()
b = getattr(lgb_train_f, f)() b = getattr(lgb_train_f, f)()
if a is None and b is None: if a is None and b is None:
...@@ -62,83 +61,83 @@ class FileLoader: ...@@ -62,83 +61,83 @@ class FileLoader:
assert a == b, f assert a == b, f
def path(self, suffix): def path(self, suffix):
return self.directory / f'{self.prefix}{suffix}' return self.directory / f"{self.prefix}{suffix}"
def test_binary(): def test_binary():
fd = FileLoader(EXAMPLES_DIR / 'binary_classification', 'binary') fd = FileLoader(EXAMPLES_DIR / "binary_classification", "binary")
X_train, y_train, _ = fd.load_dataset('.train') X_train, y_train, _ = fd.load_dataset(".train")
X_test, _, X_test_fn = fd.load_dataset('.test') X_test, _, X_test_fn = fd.load_dataset(".test")
weight_train = fd.load_field('.train.weight') weight_train = fd.load_field(".train.weight")
lgb_train = lgb.Dataset(X_train, y_train, params=fd.params, weight=weight_train) lgb_train = lgb.Dataset(X_train, y_train, params=fd.params, weight=weight_train)
gbm = lgb.LGBMClassifier(**fd.params) gbm = lgb.LGBMClassifier(**fd.params)
gbm.fit(X_train, y_train, sample_weight=weight_train) gbm.fit(X_train, y_train, sample_weight=weight_train)
sk_pred = gbm.predict_proba(X_test)[:, 1] sk_pred = gbm.predict_proba(X_test)[:, 1]
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred) fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train') fd.file_load_check(lgb_train, ".train")
def test_binary_linear(): def test_binary_linear():
fd = FileLoader(EXAMPLES_DIR / 'binary_classification', 'binary', 'train_linear.conf') fd = FileLoader(EXAMPLES_DIR / "binary_classification", "binary", "train_linear.conf")
X_train, y_train, _ = fd.load_dataset('.train') X_train, y_train, _ = fd.load_dataset(".train")
X_test, _, X_test_fn = fd.load_dataset('.test') X_test, _, X_test_fn = fd.load_dataset(".test")
weight_train = fd.load_field('.train.weight') weight_train = fd.load_field(".train.weight")
lgb_train = lgb.Dataset(X_train, y_train, params=fd.params, weight=weight_train) lgb_train = lgb.Dataset(X_train, y_train, params=fd.params, weight=weight_train)
gbm = lgb.LGBMClassifier(**fd.params) gbm = lgb.LGBMClassifier(**fd.params)
gbm.fit(X_train, y_train, sample_weight=weight_train) gbm.fit(X_train, y_train, sample_weight=weight_train)
sk_pred = gbm.predict_proba(X_test)[:, 1] sk_pred = gbm.predict_proba(X_test)[:, 1]
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred) fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train') fd.file_load_check(lgb_train, ".train")
def test_multiclass(): def test_multiclass():
fd = FileLoader(EXAMPLES_DIR / 'multiclass_classification', 'multiclass') fd = FileLoader(EXAMPLES_DIR / "multiclass_classification", "multiclass")
X_train, y_train, _ = fd.load_dataset('.train') X_train, y_train, _ = fd.load_dataset(".train")
X_test, _, X_test_fn = fd.load_dataset('.test') X_test, _, X_test_fn = fd.load_dataset(".test")
lgb_train = lgb.Dataset(X_train, y_train) lgb_train = lgb.Dataset(X_train, y_train)
gbm = lgb.LGBMClassifier(**fd.params) gbm = lgb.LGBMClassifier(**fd.params)
gbm.fit(X_train, y_train) gbm.fit(X_train, y_train)
sk_pred = gbm.predict_proba(X_test) sk_pred = gbm.predict_proba(X_test)
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred) fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train') fd.file_load_check(lgb_train, ".train")
def test_regression(): def test_regression():
fd = FileLoader(EXAMPLES_DIR / 'regression', 'regression') fd = FileLoader(EXAMPLES_DIR / "regression", "regression")
X_train, y_train, _ = fd.load_dataset('.train') X_train, y_train, _ = fd.load_dataset(".train")
X_test, _, X_test_fn = fd.load_dataset('.test') X_test, _, X_test_fn = fd.load_dataset(".test")
init_score_train = fd.load_field('.train.init') init_score_train = fd.load_field(".train.init")
lgb_train = lgb.Dataset(X_train, y_train, init_score=init_score_train) lgb_train = lgb.Dataset(X_train, y_train, init_score=init_score_train)
gbm = lgb.LGBMRegressor(**fd.params) gbm = lgb.LGBMRegressor(**fd.params)
gbm.fit(X_train, y_train, init_score=init_score_train) gbm.fit(X_train, y_train, init_score=init_score_train)
sk_pred = gbm.predict(X_test) sk_pred = gbm.predict(X_test)
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred) fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train') fd.file_load_check(lgb_train, ".train")
def test_lambdarank(): def test_lambdarank():
fd = FileLoader(EXAMPLES_DIR / 'lambdarank', 'rank') fd = FileLoader(EXAMPLES_DIR / "lambdarank", "rank")
X_train, y_train, _ = fd.load_dataset('.train', is_sparse=True) X_train, y_train, _ = fd.load_dataset(".train", is_sparse=True)
X_test, _, X_test_fn = fd.load_dataset('.test', is_sparse=True) X_test, _, X_test_fn = fd.load_dataset(".test", is_sparse=True)
group_train = fd.load_field('.train.query') group_train = fd.load_field(".train.query")
lgb_train = lgb.Dataset(X_train, y_train, group=group_train) lgb_train = lgb.Dataset(X_train, y_train, group=group_train)
params = dict(fd.params) params = dict(fd.params)
params['force_col_wise'] = True params["force_col_wise"] = True
gbm = lgb.LGBMRanker(**params) gbm = lgb.LGBMRanker(**params)
gbm.fit(X_train, y_train, group=group_train) gbm.fit(X_train, y_train, group=group_train)
sk_pred = gbm.predict(X_test) sk_pred = gbm.predict(X_test)
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred) fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train') fd.file_load_check(lgb_train, ".train")
def test_xendcg(): def test_xendcg():
fd = FileLoader(EXAMPLES_DIR / 'xendcg', 'rank') fd = FileLoader(EXAMPLES_DIR / "xendcg", "rank")
X_train, y_train, _ = fd.load_dataset('.train', is_sparse=True) X_train, y_train, _ = fd.load_dataset(".train", is_sparse=True)
X_test, _, X_test_fn = fd.load_dataset('.test', is_sparse=True) X_test, _, X_test_fn = fd.load_dataset(".test", is_sparse=True)
group_train = fd.load_field('.train.query') group_train = fd.load_field(".train.query")
lgb_train = lgb.Dataset(X_train, y_train, group=group_train) lgb_train = lgb.Dataset(X_train, y_train, group=group_train)
gbm = lgb.LGBMRanker(**fd.params) gbm = lgb.LGBMRanker(**fd.params)
gbm.fit(X_train, y_train, group=group_train) gbm.fit(X_train, y_train, group=group_train)
sk_pred = gbm.predict(X_test) sk_pred = gbm.predict(X_test)
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred) fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train') fd.file_load_check(lgb_train, ".train")
...@@ -17,12 +17,12 @@ import lightgbm as lgb ...@@ -17,12 +17,12 @@ import lightgbm as lgb
from .utils import sklearn_multiclass_custom_objective from .utils import sklearn_multiclass_custom_objective
if not platform.startswith('linux'): if not platform.startswith("linux"):
pytest.skip('lightgbm.dask is currently supported in Linux environments', allow_module_level=True) pytest.skip("lightgbm.dask is currently supported in Linux environments", allow_module_level=True)
if machine() != 'x86_64': if machine() != "x86_64":
pytest.skip('lightgbm.dask tests are currently skipped on some architectures like arm64', allow_module_level=True) pytest.skip("lightgbm.dask tests are currently skipped on some architectures like arm64", allow_module_level=True)
if not lgb.compat.DASK_INSTALLED: if not lgb.compat.DASK_INSTALLED:
pytest.skip('Dask is not installed', allow_module_level=True) pytest.skip("Dask is not installed", allow_module_level=True)
import dask.array as da import dask.array as da
import dask.dataframe as dd import dask.dataframe as dd
...@@ -37,46 +37,46 @@ from sklearn.datasets import make_blobs, make_regression ...@@ -37,46 +37,46 @@ from sklearn.datasets import make_blobs, make_regression
from .utils import make_ranking, pickle_obj, unpickle_obj from .utils import make_ranking, pickle_obj, unpickle_obj
tasks = ['binary-classification', 'multiclass-classification', 'regression', 'ranking'] tasks = ["binary-classification", "multiclass-classification", "regression", "ranking"]
distributed_training_algorithms = ['data', 'voting'] distributed_training_algorithms = ["data", "voting"]
data_output = ['array', 'scipy_csr_matrix', 'dataframe', 'dataframe-with-categorical'] data_output = ["array", "scipy_csr_matrix", "dataframe", "dataframe-with-categorical"]
boosting_types = ['gbdt', 'dart', 'goss', 'rf'] boosting_types = ["gbdt", "dart", "goss", "rf"]
group_sizes = [5, 5, 5, 10, 10, 10, 20, 20, 20, 50, 50] group_sizes = [5, 5, 5, 10, 10, 10, 20, 20, 20, 50, 50]
task_to_dask_factory = { task_to_dask_factory = {
'regression': lgb.DaskLGBMRegressor, "regression": lgb.DaskLGBMRegressor,
'binary-classification': lgb.DaskLGBMClassifier, "binary-classification": lgb.DaskLGBMClassifier,
'multiclass-classification': lgb.DaskLGBMClassifier, "multiclass-classification": lgb.DaskLGBMClassifier,
'ranking': lgb.DaskLGBMRanker "ranking": lgb.DaskLGBMRanker,
} }
task_to_local_factory = { task_to_local_factory = {
'regression': lgb.LGBMRegressor, "regression": lgb.LGBMRegressor,
'binary-classification': lgb.LGBMClassifier, "binary-classification": lgb.LGBMClassifier,
'multiclass-classification': lgb.LGBMClassifier, "multiclass-classification": lgb.LGBMClassifier,
'ranking': lgb.LGBMRanker "ranking": lgb.LGBMRanker,
} }
pytestmark = [ pytestmark = [
pytest.mark.skipif(getenv('TASK', '') == 'mpi', reason='Fails to run with MPI interface'), pytest.mark.skipif(getenv("TASK", "") == "mpi", reason="Fails to run with MPI interface"),
pytest.mark.skipif(getenv('TASK', '') == 'gpu', reason='Fails to run with GPU interface'), pytest.mark.skipif(getenv("TASK", "") == "gpu", reason="Fails to run with GPU interface"),
pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Fails to run with CUDA interface') pytest.mark.skipif(getenv("TASK", "") == "cuda", reason="Fails to run with CUDA interface"),
] ]
@pytest.fixture(scope='module') @pytest.fixture(scope="module")
def cluster(): def cluster():
dask_cluster = LocalCluster(n_workers=2, threads_per_worker=2, dashboard_address=None) dask_cluster = LocalCluster(n_workers=2, threads_per_worker=2, dashboard_address=None)
yield dask_cluster yield dask_cluster
dask_cluster.close() dask_cluster.close()
@pytest.fixture(scope='module') @pytest.fixture(scope="module")
def cluster2(): def cluster2():
dask_cluster = LocalCluster(n_workers=2, threads_per_worker=2, dashboard_address=None) dask_cluster = LocalCluster(n_workers=2, threads_per_worker=2, dashboard_address=None)
yield dask_cluster yield dask_cluster
dask_cluster.close() dask_cluster.close()
@pytest.fixture(scope='module') @pytest.fixture(scope="module")
def cluster_three_workers(): def cluster_three_workers():
dask_cluster = LocalCluster(n_workers=3, threads_per_worker=1, dashboard_address=None) dask_cluster = LocalCluster(n_workers=3, threads_per_worker=1, dashboard_address=None)
yield dask_cluster yield dask_cluster
...@@ -93,46 +93,43 @@ listen_port.port = 13000 ...@@ -93,46 +93,43 @@ listen_port.port = 13000
def _get_workers_hostname(cluster: LocalCluster) -> str: def _get_workers_hostname(cluster: LocalCluster) -> str:
one_worker_address = next(iter(cluster.scheduler_info['workers'])) one_worker_address = next(iter(cluster.scheduler_info["workers"]))
return urlparse(one_worker_address).hostname return urlparse(one_worker_address).hostname
def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs): def _create_ranking_data(n_samples=100, output="array", chunk_size=50, **kwargs):
X, y, g = make_ranking(n_samples=n_samples, random_state=42, **kwargs) X, y, g = make_ranking(n_samples=n_samples, random_state=42, **kwargs)
rnd = np.random.RandomState(42) rnd = np.random.RandomState(42)
w = rnd.rand(X.shape[0]) * 0.01 w = rnd.rand(X.shape[0]) * 0.01
g_rle = np.array([len(list(grp)) for _, grp in groupby(g)]) g_rle = np.array([len(list(grp)) for _, grp in groupby(g)])
if output.startswith('dataframe'): if output.startswith("dataframe"):
# add target, weight, and group to DataFrame so that partitions abide by group boundaries. # add target, weight, and group to DataFrame so that partitions abide by group boundaries.
X_df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])]) X_df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
if output == 'dataframe-with-categorical': if output == "dataframe-with-categorical":
for i in range(5): for i in range(5):
col_name = f"cat_col{i}" col_name = f"cat_col{i}"
cat_values = rnd.choice(['a', 'b'], X.shape[0]) cat_values = rnd.choice(["a", "b"], X.shape[0])
cat_series = pd.Series( cat_series = pd.Series(cat_values, dtype="category")
cat_values,
dtype='category'
)
X_df[col_name] = cat_series X_df[col_name] = cat_series
X = X_df.copy() X = X_df.copy()
X_df = X_df.assign(y=y, g=g, w=w) X_df = X_df.assign(y=y, g=g, w=w)
# set_index ensures partitions are based on group id. # set_index ensures partitions are based on group id.
# See https://stackoverflow.com/questions/49532824/dask-dataframe-split-partitions-based-on-a-column-or-function. # See https://stackoverflow.com/questions/49532824/dask-dataframe-split-partitions-based-on-a-column-or-function.
X_df.set_index('g', inplace=True) X_df.set_index("g", inplace=True)
dX = dd.from_pandas(X_df, chunksize=chunk_size) dX = dd.from_pandas(X_df, chunksize=chunk_size)
# separate target, weight from features. # separate target, weight from features.
dy = dX['y'] dy = dX["y"]
dw = dX['w'] dw = dX["w"]
dX = dX.drop(columns=['y', 'w']) dX = dX.drop(columns=["y", "w"])
dg = dX.index.to_series() dg = dX.index.to_series()
# encode group identifiers into run-length encoding, the format LightGBMRanker is expecting # encode group identifiers into run-length encoding, the format LightGBMRanker is expecting
# so that within each partition, sum(g) = n_samples. # so that within each partition, sum(g) = n_samples.
dg = dg.map_partitions(lambda p: p.groupby('g', sort=False).apply(lambda z: z.shape[0])) dg = dg.map_partitions(lambda p: p.groupby("g", sort=False).apply(lambda z: z.shape[0]))
elif output == 'array': elif output == "array":
# ranking arrays: one chunk per group. Each chunk must include all columns. # ranking arrays: one chunk per group. Each chunk must include all columns.
p = X.shape[1] p = X.shape[1]
dX, dy, dw, dg = [], [], [], [] dX, dy, dw, dg = [], [], [], []
...@@ -148,71 +145,63 @@ def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs) ...@@ -148,71 +145,63 @@ def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs)
dw = da.concatenate(dw, axis=0) dw = da.concatenate(dw, axis=0)
dg = da.concatenate(dg, axis=0) dg = da.concatenate(dg, axis=0)
else: else:
raise ValueError('Ranking data creation only supported for Dask arrays and dataframes') raise ValueError("Ranking data creation only supported for Dask arrays and dataframes")
return X, y, w, g_rle, dX, dy, dw, dg return X, y, w, g_rle, dX, dy, dw, dg
def _create_data(objective, n_samples=1_000, output='array', chunk_size=500, **kwargs): def _create_data(objective, n_samples=1_000, output="array", chunk_size=500, **kwargs):
if objective.endswith('classification'): if objective.endswith("classification"):
if objective == 'binary-classification': if objective == "binary-classification":
centers = [[-4, -4], [4, 4]] centers = [[-4, -4], [4, 4]]
elif objective == 'multiclass-classification': elif objective == "multiclass-classification":
centers = [[-4, -4], [4, 4], [-4, 4]] centers = [[-4, -4], [4, 4], [-4, 4]]
else: else:
raise ValueError(f"Unknown classification task '{objective}'") raise ValueError(f"Unknown classification task '{objective}'")
X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=42) X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=42)
elif objective == 'regression': elif objective == "regression":
X, y = make_regression(n_samples=n_samples, n_features=4, n_informative=2, random_state=42) X, y = make_regression(n_samples=n_samples, n_features=4, n_informative=2, random_state=42)
elif objective == 'ranking': elif objective == "ranking":
return _create_ranking_data( return _create_ranking_data(n_samples=n_samples, output=output, chunk_size=chunk_size, **kwargs)
n_samples=n_samples,
output=output,
chunk_size=chunk_size,
**kwargs
)
else: else:
raise ValueError(f"Unknown objective '{objective}'") raise ValueError(f"Unknown objective '{objective}'")
rnd = np.random.RandomState(42) rnd = np.random.RandomState(42)
weights = rnd.random(X.shape[0]) * 0.01 weights = rnd.random(X.shape[0]) * 0.01
if output == 'array': if output == "array":
dX = da.from_array(X, (chunk_size, X.shape[1])) dX = da.from_array(X, (chunk_size, X.shape[1]))
dy = da.from_array(y, chunk_size) dy = da.from_array(y, chunk_size)
dw = da.from_array(weights, chunk_size) dw = da.from_array(weights, chunk_size)
elif output.startswith('dataframe'): elif output.startswith("dataframe"):
X_df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])]) X_df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
if output == 'dataframe-with-categorical': if output == "dataframe-with-categorical":
num_cat_cols = 2 num_cat_cols = 2
for i in range(num_cat_cols): for i in range(num_cat_cols):
col_name = f"cat_col{i}" col_name = f"cat_col{i}"
cat_values = rnd.choice(['a', 'b'], X.shape[0]) cat_values = rnd.choice(["a", "b"], X.shape[0])
cat_series = pd.Series( cat_series = pd.Series(cat_values, dtype="category")
cat_values,
dtype='category'
)
X_df[col_name] = cat_series X_df[col_name] = cat_series
X = np.hstack((X, cat_series.cat.codes.values.reshape(-1, 1))) X = np.hstack((X, cat_series.cat.codes.values.reshape(-1, 1)))
# make one categorical feature relevant to the target # make one categorical feature relevant to the target
cat_col_is_a = X_df['cat_col0'] == 'a' cat_col_is_a = X_df["cat_col0"] == "a"
if objective == 'regression': if objective == "regression":
y = np.where(cat_col_is_a, y, 2 * y) y = np.where(cat_col_is_a, y, 2 * y)
elif objective == 'binary-classification': elif objective == "binary-classification":
y = np.where(cat_col_is_a, y, 1 - y) y = np.where(cat_col_is_a, y, 1 - y)
elif objective == 'multiclass-classification': elif objective == "multiclass-classification":
n_classes = 3 n_classes = 3
y = np.where(cat_col_is_a, y, (1 + y) % n_classes) y = np.where(cat_col_is_a, y, (1 + y) % n_classes)
y_df = pd.Series(y, name='target') y_df = pd.Series(y, name="target")
dX = dd.from_pandas(X_df, chunksize=chunk_size) dX = dd.from_pandas(X_df, chunksize=chunk_size)
dy = dd.from_pandas(y_df, chunksize=chunk_size) dy = dd.from_pandas(y_df, chunksize=chunk_size)
dw = dd.from_array(weights, chunksize=chunk_size) dw = dd.from_array(weights, chunksize=chunk_size)
elif output == 'scipy_csr_matrix': elif output == "scipy_csr_matrix":
dX = da.from_array(X, chunks=(chunk_size, X.shape[1])).map_blocks(csr_matrix) dX = da.from_array(X, chunks=(chunk_size, X.shape[1])).map_blocks(csr_matrix)
dy = da.from_array(y, chunks=chunk_size) dy = da.from_array(y, chunks=chunk_size)
dw = da.from_array(weights, chunk_size) dw = da.from_array(weights, chunk_size)
X = csr_matrix(X) X = csr_matrix(X)
elif output == 'scipy_csc_matrix': elif output == "scipy_csc_matrix":
dX = da.from_array(X, chunks=(chunk_size, X.shape[1])).map_blocks(csc_matrix) dX = da.from_array(X, chunks=(chunk_size, X.shape[1])).map_blocks(csc_matrix)
dy = da.from_array(y, chunks=chunk_size) dy = da.from_array(y, chunks=chunk_size)
dw = da.from_array(weights, chunk_size) dw = da.from_array(weights, chunk_size)
...@@ -234,7 +223,7 @@ def _accuracy_score(dy_true, dy_pred): ...@@ -234,7 +223,7 @@ def _accuracy_score(dy_true, dy_pred):
def _constant_metric(y_true, y_pred): def _constant_metric(y_true, y_pred):
metric_name = 'constant_metric' metric_name = "constant_metric"
value = 0.708 value = 0.708
is_higher_better = False is_higher_better = False
return metric_name, value, is_higher_better return metric_name, value, is_higher_better
...@@ -253,46 +242,32 @@ def _objective_logistic_regression(y_true, y_pred): ...@@ -253,46 +242,32 @@ def _objective_logistic_regression(y_true, y_pred):
return grad, hess return grad, hess
@pytest.mark.parametrize('output', data_output) @pytest.mark.parametrize("output", data_output)
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification']) @pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification"])
@pytest.mark.parametrize('boosting_type', boosting_types) @pytest.mark.parametrize("boosting_type", boosting_types)
@pytest.mark.parametrize('tree_learner', distributed_training_algorithms) @pytest.mark.parametrize("tree_learner", distributed_training_algorithms)
def test_classifier(output, task, boosting_type, tree_learner, cluster): def test_classifier(output, task, boosting_type, tree_learner, cluster):
with Client(cluster) as client: with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data( X, y, w, _, dX, dy, dw, _ = _create_data(objective=task, output=output)
objective=task,
output=output params = {"boosting_type": boosting_type, "tree_learner": tree_learner, "n_estimators": 50, "num_leaves": 31}
) if boosting_type == "rf":
params.update(
{
"bagging_freq": 1,
"bagging_fraction": 0.9,
}
)
elif boosting_type == "goss":
params["top_rate"] = 0.5
params = { dask_classifier = lgb.DaskLGBMClassifier(client=client, time_out=5, **params)
"boosting_type": boosting_type,
"tree_learner": tree_learner,
"n_estimators": 50,
"num_leaves": 31
}
if boosting_type == 'rf':
params.update({
'bagging_freq': 1,
'bagging_fraction': 0.9,
})
elif boosting_type == 'goss':
params['top_rate'] = 0.5
dask_classifier = lgb.DaskLGBMClassifier(
client=client,
time_out=5,
**params
)
dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw) dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
p1 = dask_classifier.predict(dX) p1 = dask_classifier.predict(dX)
p1_raw = dask_classifier.predict(dX, raw_score=True).compute() p1_raw = dask_classifier.predict(dX, raw_score=True).compute()
p1_first_iter_raw = dask_classifier.predict(dX, start_iteration=0, num_iteration=1, raw_score=True).compute() p1_first_iter_raw = dask_classifier.predict(dX, start_iteration=0, num_iteration=1, raw_score=True).compute()
p1_early_stop_raw = dask_classifier.predict( p1_early_stop_raw = dask_classifier.predict(
dX, dX, pred_early_stop=True, pred_early_stop_margin=1.0, pred_early_stop_freq=2, raw_score=True
pred_early_stop=True,
pred_early_stop_margin=1.0,
pred_early_stop_freq=2,
raw_score=True
).compute() ).compute()
p1_proba = dask_classifier.predict_proba(dX).compute() p1_proba = dask_classifier.predict_proba(dX).compute()
p1_pred_leaf = dask_classifier.predict(dX, pred_leaf=True) p1_pred_leaf = dask_classifier.predict(dX, pred_leaf=True)
...@@ -306,7 +281,7 @@ def test_classifier(output, task, boosting_type, tree_learner, cluster): ...@@ -306,7 +281,7 @@ def test_classifier(output, task, boosting_type, tree_learner, cluster):
p2_proba = local_classifier.predict_proba(X) p2_proba = local_classifier.predict_proba(X)
s2 = local_classifier.score(X, y) s2 = local_classifier.score(X, y)
if boosting_type == 'rf': if boosting_type == "rf":
# https://github.com/microsoft/LightGBM/issues/4118 # https://github.com/microsoft/LightGBM/issues/4118
assert_eq(s1, s2, atol=0.01) assert_eq(s1, s2, atol=0.01)
assert_eq(p1_proba, p2_proba, atol=0.8) assert_eq(p1_proba, p2_proba, atol=0.8)
...@@ -329,47 +304,30 @@ def test_classifier(output, task, boosting_type, tree_learner, cluster): ...@@ -329,47 +304,30 @@ def test_classifier(output, task, boosting_type, tree_learner, cluster):
# pref_leaf values should have the right shape # pref_leaf values should have the right shape
# and values that look like valid tree nodes # and values that look like valid tree nodes
pred_leaf_vals = p1_pred_leaf.compute() pred_leaf_vals = p1_pred_leaf.compute()
assert pred_leaf_vals.shape == ( assert pred_leaf_vals.shape == (X.shape[0], dask_classifier.booster_.num_trees())
X.shape[0], assert np.max(pred_leaf_vals) <= params["num_leaves"]
dask_classifier.booster_.num_trees()
)
assert np.max(pred_leaf_vals) <= params['num_leaves']
assert np.min(pred_leaf_vals) >= 0 assert np.min(pred_leaf_vals) >= 0
assert len(np.unique(pred_leaf_vals)) <= params['num_leaves'] assert len(np.unique(pred_leaf_vals)) <= params["num_leaves"]
# be sure LightGBM actually used at least one categorical column, # be sure LightGBM actually used at least one categorical column,
# and that it was correctly treated as a categorical feature # and that it was correctly treated as a categorical feature
if output == 'dataframe-with-categorical': if output == "dataframe-with-categorical":
cat_cols = [ cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
col for col in dX.columns
if dX.dtypes[col].name == 'category'
]
tree_df = dask_classifier.booster_.trees_to_dataframe() tree_df = dask_classifier.booster_.trees_to_dataframe()
node_uses_cat_col = tree_df['split_feature'].isin(cat_cols) node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
assert node_uses_cat_col.sum() > 0 assert node_uses_cat_col.sum() > 0
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '==' assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
@pytest.mark.parametrize('output', data_output + ['scipy_csc_matrix']) @pytest.mark.parametrize("output", data_output + ["scipy_csc_matrix"])
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification']) @pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification"])
def test_classifier_pred_contrib(output, task, cluster): def test_classifier_pred_contrib(output, task, cluster):
with Client(cluster) as client: with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data( X, y, w, _, dX, dy, dw, _ = _create_data(objective=task, output=output)
objective=task,
output=output
)
params = { params = {"n_estimators": 10, "num_leaves": 10}
"n_estimators": 10,
"num_leaves": 10
}
dask_classifier = lgb.DaskLGBMClassifier( dask_classifier = lgb.DaskLGBMClassifier(client=client, time_out=5, tree_learner="data", **params)
client=client,
time_out=5,
tree_learner='data',
**params
)
dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw) dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
preds_with_contrib = dask_classifier.predict(dX, pred_contrib=True) preds_with_contrib = dask_classifier.predict(dX, pred_contrib=True)
...@@ -390,10 +348,10 @@ def test_classifier_pred_contrib(output, task, cluster): ...@@ -390,10 +348,10 @@ def test_classifier_pred_contrib(output, task, cluster):
# #
# since that case is so different than all other cases, check the relevant things here # since that case is so different than all other cases, check the relevant things here
# and then return early # and then return early
if output.startswith('scipy') and task == 'multiclass-classification': if output.startswith("scipy") and task == "multiclass-classification":
if output == 'scipy_csr_matrix': if output == "scipy_csr_matrix":
expected_type = csr_matrix expected_type = csr_matrix
elif output == 'scipy_csc_matrix': elif output == "scipy_csc_matrix":
expected_type = csc_matrix expected_type = csc_matrix
else: else:
raise ValueError(f"Unrecognized output type: {output}") raise ValueError(f"Unrecognized output type: {output}")
...@@ -415,20 +373,17 @@ def test_classifier_pred_contrib(output, task, cluster): ...@@ -415,20 +373,17 @@ def test_classifier_pred_contrib(output, task, cluster):
return return
preds_with_contrib = preds_with_contrib.compute() preds_with_contrib = preds_with_contrib.compute()
if output.startswith('scipy'): if output.startswith("scipy"):
preds_with_contrib = preds_with_contrib.toarray() preds_with_contrib = preds_with_contrib.toarray()
# be sure LightGBM actually used at least one categorical column, # be sure LightGBM actually used at least one categorical column,
# and that it was correctly treated as a categorical feature # and that it was correctly treated as a categorical feature
if output == 'dataframe-with-categorical': if output == "dataframe-with-categorical":
cat_cols = [ cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
col for col in dX.columns
if dX.dtypes[col].name == 'category'
]
tree_df = dask_classifier.booster_.trees_to_dataframe() tree_df = dask_classifier.booster_.trees_to_dataframe()
node_uses_cat_col = tree_df['split_feature'].isin(cat_cols) node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
assert node_uses_cat_col.sum() > 0 assert node_uses_cat_col.sum() > 0
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '==' assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
# * shape depends on whether it is binary or multiclass classification # * shape depends on whether it is binary or multiclass classification
# * matrix for binary classification is of the form [feature_contrib, base_value], # * matrix for binary classification is of the form [feature_contrib, base_value],
...@@ -446,8 +401,8 @@ def test_classifier_pred_contrib(output, task, cluster): ...@@ -446,8 +401,8 @@ def test_classifier_pred_contrib(output, task, cluster):
assert len(np.unique(preds_with_contrib[:, base_value_col]) == 1) assert len(np.unique(preds_with_contrib[:, base_value_col]) == 1)
@pytest.mark.parametrize('output', data_output) @pytest.mark.parametrize("output", data_output)
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification']) @pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification"])
def test_classifier_custom_objective(output, task, cluster): def test_classifier_custom_objective(output, task, cluster):
with Client(cluster) as client: with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data( X, y, w, _, dX, dy, dw, _ = _create_data(
...@@ -461,25 +416,19 @@ def test_classifier_custom_objective(output, task, cluster): ...@@ -461,25 +416,19 @@ def test_classifier_custom_objective(output, task, cluster):
"verbose": -1, "verbose": -1,
"seed": 708, "seed": 708,
"deterministic": True, "deterministic": True,
"force_col_wise": True "force_col_wise": True,
} }
if task == 'binary-classification': if task == "binary-classification":
params.update({ params.update(
'objective': _objective_logistic_regression, {
}) "objective": _objective_logistic_regression,
elif task == 'multiclass-classification': }
params.update({ )
'objective': sklearn_multiclass_custom_objective, elif task == "multiclass-classification":
'num_classes': 3 params.update({"objective": sklearn_multiclass_custom_objective, "num_classes": 3})
})
dask_classifier = lgb.DaskLGBMClassifier(client=client, time_out=5, tree_learner="data", **params)
dask_classifier = lgb.DaskLGBMClassifier(
client=client,
time_out=5,
tree_learner='data',
**params
)
dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw) dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
dask_classifier_local = dask_classifier.to_local() dask_classifier_local = dask_classifier.to_local()
p1_raw = dask_classifier.predict(dX, raw_score=True).compute() p1_raw = dask_classifier.predict(dX, raw_score=True).compute()
...@@ -490,14 +439,14 @@ def test_classifier_custom_objective(output, task, cluster): ...@@ -490,14 +439,14 @@ def test_classifier_custom_objective(output, task, cluster):
p2_raw = local_classifier.predict(X, raw_score=True) p2_raw = local_classifier.predict(X, raw_score=True)
# with a custom objective, prediction result is a raw score instead of predicted class # with a custom objective, prediction result is a raw score instead of predicted class
if task == 'binary-classification': if task == "binary-classification":
p1_proba = 1.0 / (1.0 + np.exp(-p1_raw)) p1_proba = 1.0 / (1.0 + np.exp(-p1_raw))
p1_class = (p1_proba > 0.5).astype(np.int64) p1_class = (p1_proba > 0.5).astype(np.int64)
p1_proba_local = 1.0 / (1.0 + np.exp(-p1_raw_local)) p1_proba_local = 1.0 / (1.0 + np.exp(-p1_raw_local))
p1_class_local = (p1_proba_local > 0.5).astype(np.int64) p1_class_local = (p1_proba_local > 0.5).astype(np.int64)
p2_proba = 1.0 / (1.0 + np.exp(-p2_raw)) p2_proba = 1.0 / (1.0 + np.exp(-p2_raw))
p2_class = (p2_proba > 0.5).astype(np.int64) p2_class = (p2_proba > 0.5).astype(np.int64)
elif task == 'multiclass-classification': elif task == "multiclass-classification":
p1_proba = np.exp(p1_raw) / np.sum(np.exp(p1_raw), axis=1).reshape(-1, 1) p1_proba = np.exp(p1_raw) / np.sum(np.exp(p1_raw), axis=1).reshape(-1, 1)
p1_class = p1_proba.argmax(axis=1) p1_class = p1_proba.argmax(axis=1)
p1_proba_local = np.exp(p1_raw_local) / np.sum(np.exp(p1_raw_local), axis=1).reshape(-1, 1) p1_proba_local = np.exp(p1_raw_local) / np.sum(np.exp(p1_raw_local), axis=1).reshape(-1, 1)
...@@ -520,7 +469,7 @@ def test_classifier_custom_objective(output, task, cluster): ...@@ -520,7 +469,7 @@ def test_classifier_custom_objective(output, task, cluster):
def test_machines_to_worker_map_unparseable_host_names(): def test_machines_to_worker_map_unparseable_host_names():
workers = {'0.0.0.1:80': {}, '0.0.0.2:80': {}} workers = {"0.0.0.1:80": {}, "0.0.0.2:80": {}}
machines = "0.0.0.1:80,0.0.0.2:80" machines = "0.0.0.1:80,0.0.0.2:80"
with pytest.raises(ValueError, match="Could not parse host name from worker address '0.0.0.1:80'"): with pytest.raises(ValueError, match="Could not parse host name from worker address '0.0.0.1:80'"):
lgb.dask._machines_to_worker_map(machines=machines, worker_addresses=workers.keys()) lgb.dask._machines_to_worker_map(machines=machines, worker_addresses=workers.keys())
...@@ -528,18 +477,13 @@ def test_machines_to_worker_map_unparseable_host_names(): ...@@ -528,18 +477,13 @@ def test_machines_to_worker_map_unparseable_host_names():
def test_training_does_not_fail_on_port_conflicts(cluster): def test_training_does_not_fail_on_port_conflicts(cluster):
with Client(cluster) as client: with Client(cluster) as client:
_, _, _, _, dX, dy, dw, _ = _create_data('binary-classification', output='array') _, _, _, _, dX, dy, dw, _ = _create_data("binary-classification", output="array")
lightgbm_default_port = 12400 lightgbm_default_port = 12400
workers_hostname = _get_workers_hostname(cluster) workers_hostname = _get_workers_hostname(cluster)
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind((workers_hostname, lightgbm_default_port)) s.bind((workers_hostname, lightgbm_default_port))
dask_classifier = lgb.DaskLGBMClassifier( dask_classifier = lgb.DaskLGBMClassifier(client=client, time_out=5, n_estimators=5, num_leaves=5)
client=client,
time_out=5,
n_estimators=5,
num_leaves=5
)
for _ in range(5): for _ in range(5):
dask_classifier.fit( dask_classifier.fit(
X=dX, X=dX,
...@@ -549,15 +493,12 @@ def test_training_does_not_fail_on_port_conflicts(cluster): ...@@ -549,15 +493,12 @@ def test_training_does_not_fail_on_port_conflicts(cluster):
assert dask_classifier.booster_ assert dask_classifier.booster_
@pytest.mark.parametrize('output', data_output) @pytest.mark.parametrize("output", data_output)
@pytest.mark.parametrize('boosting_type', boosting_types) @pytest.mark.parametrize("boosting_type", boosting_types)
@pytest.mark.parametrize('tree_learner', distributed_training_algorithms) @pytest.mark.parametrize("tree_learner", distributed_training_algorithms)
def test_regressor(output, boosting_type, tree_learner, cluster): def test_regressor(output, boosting_type, tree_learner, cluster):
with Client(cluster) as client: with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data( X, y, w, _, dX, dy, dw, _ = _create_data(objective="regression", output=output)
objective='regression',
output=output
)
params = { params = {
"boosting_type": boosting_type, "boosting_type": boosting_type,
...@@ -565,18 +506,15 @@ def test_regressor(output, boosting_type, tree_learner, cluster): ...@@ -565,18 +506,15 @@ def test_regressor(output, boosting_type, tree_learner, cluster):
"num_leaves": 31, "num_leaves": 31,
"n_estimators": 20, "n_estimators": 20,
} }
if boosting_type == 'rf': if boosting_type == "rf":
params.update({ params.update(
'bagging_freq': 1, {
'bagging_fraction': 0.9, "bagging_freq": 1,
}) "bagging_fraction": 0.9,
}
)
dask_regressor = lgb.DaskLGBMRegressor( dask_regressor = lgb.DaskLGBMRegressor(client=client, time_out=5, tree=tree_learner, **params)
client=client,
time_out=5,
tree=tree_learner,
**params
)
dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw) dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
p1 = dask_regressor.predict(dX) p1 = dask_regressor.predict(dX)
p1_pred_leaf = dask_regressor.predict(dX, pred_leaf=True) p1_pred_leaf = dask_regressor.predict(dX, pred_leaf=True)
...@@ -603,16 +541,13 @@ def test_regressor(output, boosting_type, tree_learner, cluster): ...@@ -603,16 +541,13 @@ def test_regressor(output, boosting_type, tree_learner, cluster):
# pref_leaf values should have the right shape # pref_leaf values should have the right shape
# and values that look like valid tree nodes # and values that look like valid tree nodes
pred_leaf_vals = p1_pred_leaf.compute() pred_leaf_vals = p1_pred_leaf.compute()
assert pred_leaf_vals.shape == ( assert pred_leaf_vals.shape == (X.shape[0], dask_regressor.booster_.num_trees())
X.shape[0], assert np.max(pred_leaf_vals) <= params["num_leaves"]
dask_regressor.booster_.num_trees()
)
assert np.max(pred_leaf_vals) <= params['num_leaves']
assert np.min(pred_leaf_vals) >= 0 assert np.min(pred_leaf_vals) >= 0
assert len(np.unique(pred_leaf_vals)) <= params['num_leaves'] assert len(np.unique(pred_leaf_vals)) <= params["num_leaves"]
assert_eq(p1, y, rtol=0.5, atol=50.) assert_eq(p1, y, rtol=0.5, atol=50.0)
assert_eq(p2, y, rtol=0.5, atol=50.) assert_eq(p2, y, rtol=0.5, atol=50.0)
# extra predict() parameters should be passed through correctly # extra predict() parameters should be passed through correctly
with pytest.raises(AssertionError): with pytest.raises(AssertionError):
...@@ -620,36 +555,22 @@ def test_regressor(output, boosting_type, tree_learner, cluster): ...@@ -620,36 +555,22 @@ def test_regressor(output, boosting_type, tree_learner, cluster):
# be sure LightGBM actually used at least one categorical column, # be sure LightGBM actually used at least one categorical column,
# and that it was correctly treated as a categorical feature # and that it was correctly treated as a categorical feature
if output == 'dataframe-with-categorical': if output == "dataframe-with-categorical":
cat_cols = [ cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
col for col in dX.columns
if dX.dtypes[col].name == 'category'
]
tree_df = dask_regressor.booster_.trees_to_dataframe() tree_df = dask_regressor.booster_.trees_to_dataframe()
node_uses_cat_col = tree_df['split_feature'].isin(cat_cols) node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
assert node_uses_cat_col.sum() > 0 assert node_uses_cat_col.sum() > 0
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '==' assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
@pytest.mark.parametrize('output', data_output) @pytest.mark.parametrize("output", data_output)
def test_regressor_pred_contrib(output, cluster): def test_regressor_pred_contrib(output, cluster):
with Client(cluster) as client: with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data( X, y, w, _, dX, dy, dw, _ = _create_data(objective="regression", output=output)
objective='regression',
output=output
)
params = { params = {"n_estimators": 10, "num_leaves": 10}
"n_estimators": 10,
"num_leaves": 10
}
dask_regressor = lgb.DaskLGBMRegressor( dask_regressor = lgb.DaskLGBMRegressor(client=client, time_out=5, tree_learner="data", **params)
client=client,
time_out=5,
tree_learner='data',
**params
)
dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw) dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
preds_with_contrib = dask_regressor.predict(dX, pred_contrib=True).compute() preds_with_contrib = dask_regressor.predict(dX, pred_contrib=True).compute()
...@@ -668,39 +589,23 @@ def test_regressor_pred_contrib(output, cluster): ...@@ -668,39 +589,23 @@ def test_regressor_pred_contrib(output, cluster):
# be sure LightGBM actually used at least one categorical column, # be sure LightGBM actually used at least one categorical column,
# and that it was correctly treated as a categorical feature # and that it was correctly treated as a categorical feature
if output == 'dataframe-with-categorical': if output == "dataframe-with-categorical":
cat_cols = [ cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
col for col in dX.columns
if dX.dtypes[col].name == 'category'
]
tree_df = dask_regressor.booster_.trees_to_dataframe() tree_df = dask_regressor.booster_.trees_to_dataframe()
node_uses_cat_col = tree_df['split_feature'].isin(cat_cols) node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
assert node_uses_cat_col.sum() > 0 assert node_uses_cat_col.sum() > 0
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '==' assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
@pytest.mark.parametrize('output', data_output) @pytest.mark.parametrize("output", data_output)
@pytest.mark.parametrize('alpha', [.1, .5, .9]) @pytest.mark.parametrize("alpha", [0.1, 0.5, 0.9])
def test_regressor_quantile(output, alpha, cluster): def test_regressor_quantile(output, alpha, cluster):
with Client(cluster) as client: with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data( X, y, w, _, dX, dy, dw, _ = _create_data(objective="regression", output=output)
objective='regression',
output=output
)
params = { params = {"objective": "quantile", "alpha": alpha, "random_state": 42, "n_estimators": 10, "num_leaves": 10}
"objective": "quantile",
"alpha": alpha,
"random_state": 42,
"n_estimators": 10,
"num_leaves": 10
}
dask_regressor = lgb.DaskLGBMRegressor( dask_regressor = lgb.DaskLGBMRegressor(client=client, tree_learner_type="data_parallel", **params)
client=client,
tree_learner_type='data_parallel',
**params
)
dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw) dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
p1 = dask_regressor.predict(dX).compute() p1 = dask_regressor.predict(dX).compute()
q1 = np.count_nonzero(y < p1) / y.shape[0] q1 = np.count_nonzero(y < p1) / y.shape[0]
...@@ -716,37 +621,22 @@ def test_regressor_quantile(output, alpha, cluster): ...@@ -716,37 +621,22 @@ def test_regressor_quantile(output, alpha, cluster):
# be sure LightGBM actually used at least one categorical column, # be sure LightGBM actually used at least one categorical column,
# and that it was correctly treated as a categorical feature # and that it was correctly treated as a categorical feature
if output == 'dataframe-with-categorical': if output == "dataframe-with-categorical":
cat_cols = [ cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
col for col in dX.columns
if dX.dtypes[col].name == 'category'
]
tree_df = dask_regressor.booster_.trees_to_dataframe() tree_df = dask_regressor.booster_.trees_to_dataframe()
node_uses_cat_col = tree_df['split_feature'].isin(cat_cols) node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
assert node_uses_cat_col.sum() > 0 assert node_uses_cat_col.sum() > 0
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '==' assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
@pytest.mark.parametrize('output', data_output) @pytest.mark.parametrize("output", data_output)
def test_regressor_custom_objective(output, cluster): def test_regressor_custom_objective(output, cluster):
with Client(cluster) as client: with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data( X, y, w, _, dX, dy, dw, _ = _create_data(objective="regression", output=output)
objective='regression',
output=output
)
params = { params = {"n_estimators": 10, "num_leaves": 10, "objective": _objective_least_squares}
"n_estimators": 10,
"num_leaves": 10,
"objective": _objective_least_squares
}
dask_regressor = lgb.DaskLGBMRegressor( dask_regressor = lgb.DaskLGBMRegressor(client=client, time_out=5, tree_learner="data", **params)
client=client,
time_out=5,
tree_learner='data',
**params
)
dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw) dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
dask_regressor_local = dask_regressor.to_local() dask_regressor_local = dask_regressor.to_local()
p1 = dask_regressor.predict(dX) p1 = dask_regressor.predict(dX)
...@@ -772,34 +662,26 @@ def test_regressor_custom_objective(output, cluster): ...@@ -772,34 +662,26 @@ def test_regressor_custom_objective(output, cluster):
assert_eq(p1, p1_local) assert_eq(p1, p1_local)
# predictions should be better than random # predictions should be better than random
assert_precision = {"rtol": 0.5, "atol": 50.} assert_precision = {"rtol": 0.5, "atol": 50.0}
assert_eq(p1, y, **assert_precision) assert_eq(p1, y, **assert_precision)
assert_eq(p2, y, **assert_precision) assert_eq(p2, y, **assert_precision)
@pytest.mark.parametrize('output', ['array', 'dataframe', 'dataframe-with-categorical']) @pytest.mark.parametrize("output", ["array", "dataframe", "dataframe-with-categorical"])
@pytest.mark.parametrize('group', [None, group_sizes]) @pytest.mark.parametrize("group", [None, group_sizes])
@pytest.mark.parametrize('boosting_type', boosting_types) @pytest.mark.parametrize("boosting_type", boosting_types)
@pytest.mark.parametrize('tree_learner', distributed_training_algorithms) @pytest.mark.parametrize("tree_learner", distributed_training_algorithms)
def test_ranker(output, group, boosting_type, tree_learner, cluster): def test_ranker(output, group, boosting_type, tree_learner, cluster):
with Client(cluster) as client: with Client(cluster) as client:
if output == 'dataframe-with-categorical': if output == "dataframe-with-categorical":
X, y, w, g, dX, dy, dw, dg = _create_data( X, y, w, g, dX, dy, dw, dg = _create_data(
objective='ranking', objective="ranking", output=output, group=group, n_features=1, n_informative=1
output=output,
group=group,
n_features=1,
n_informative=1
) )
else: else:
X, y, w, g, dX, dy, dw, dg = _create_data( X, y, w, g, dX, dy, dw, dg = _create_data(objective="ranking", output=output, group=group)
objective='ranking',
output=output,
group=group
)
# rebalance small dask.Array dataset for better performance. # rebalance small dask.Array dataset for better performance.
if output == 'array': if output == "array":
dX = dX.persist() dX = dX.persist()
dy = dy.persist() dy = dy.persist()
dw = dw.persist() dw = dw.persist()
...@@ -814,20 +696,17 @@ def test_ranker(output, group, boosting_type, tree_learner, cluster): ...@@ -814,20 +696,17 @@ def test_ranker(output, group, boosting_type, tree_learner, cluster):
"random_state": 42, "random_state": 42,
"n_estimators": 50, "n_estimators": 50,
"num_leaves": 20, "num_leaves": 20,
"min_child_samples": 1 "min_child_samples": 1,
} }
if boosting_type == 'rf': if boosting_type == "rf":
params.update({ params.update(
'bagging_freq': 1, {
'bagging_fraction': 0.9, "bagging_freq": 1,
}) "bagging_fraction": 0.9,
}
dask_ranker = lgb.DaskLGBMRanker( )
client=client,
time_out=5, dask_ranker = lgb.DaskLGBMRanker(client=client, time_out=5, tree_learner_type=tree_learner, **params)
tree_learner_type=tree_learner,
**params
)
dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg) dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
rnkvec_dask = dask_ranker.predict(dX) rnkvec_dask = dask_ranker.predict(dX)
rnkvec_dask = rnkvec_dask.compute() rnkvec_dask = rnkvec_dask.compute()
...@@ -835,11 +714,7 @@ def test_ranker(output, group, boosting_type, tree_learner, cluster): ...@@ -835,11 +714,7 @@ def test_ranker(output, group, boosting_type, tree_learner, cluster):
p1_raw = dask_ranker.predict(dX, raw_score=True).compute() p1_raw = dask_ranker.predict(dX, raw_score=True).compute()
p1_first_iter_raw = dask_ranker.predict(dX, start_iteration=0, num_iteration=1, raw_score=True).compute() p1_first_iter_raw = dask_ranker.predict(dX, start_iteration=0, num_iteration=1, raw_score=True).compute()
p1_early_stop_raw = dask_ranker.predict( p1_early_stop_raw = dask_ranker.predict(
dX, dX, pred_early_stop=True, pred_early_stop_margin=1.0, pred_early_stop_freq=2, raw_score=True
pred_early_stop=True,
pred_early_stop_margin=1.0,
pred_early_stop_freq=2,
raw_score=True
).compute() ).compute()
rnkvec_dask_local = dask_ranker.to_local().predict(X) rnkvec_dask_local = dask_ranker.to_local().predict(X)
...@@ -864,47 +739,33 @@ def test_ranker(output, group, boosting_type, tree_learner, cluster): ...@@ -864,47 +739,33 @@ def test_ranker(output, group, boosting_type, tree_learner, cluster):
# pref_leaf values should have the right shape # pref_leaf values should have the right shape
# and values that look like valid tree nodes # and values that look like valid tree nodes
pred_leaf_vals = p1_pred_leaf.compute() pred_leaf_vals = p1_pred_leaf.compute()
assert pred_leaf_vals.shape == ( assert pred_leaf_vals.shape == (X.shape[0], dask_ranker.booster_.num_trees())
X.shape[0], assert np.max(pred_leaf_vals) <= params["num_leaves"]
dask_ranker.booster_.num_trees()
)
assert np.max(pred_leaf_vals) <= params['num_leaves']
assert np.min(pred_leaf_vals) >= 0 assert np.min(pred_leaf_vals) >= 0
assert len(np.unique(pred_leaf_vals)) <= params['num_leaves'] assert len(np.unique(pred_leaf_vals)) <= params["num_leaves"]
# be sure LightGBM actually used at least one categorical column, # be sure LightGBM actually used at least one categorical column,
# and that it was correctly treated as a categorical feature # and that it was correctly treated as a categorical feature
if output == 'dataframe-with-categorical': if output == "dataframe-with-categorical":
cat_cols = [ cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
col for col in dX.columns
if dX.dtypes[col].name == 'category'
]
tree_df = dask_ranker.booster_.trees_to_dataframe() tree_df = dask_ranker.booster_.trees_to_dataframe()
node_uses_cat_col = tree_df['split_feature'].isin(cat_cols) node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
assert node_uses_cat_col.sum() > 0 assert node_uses_cat_col.sum() > 0
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '==' assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
@pytest.mark.parametrize('output', ['array', 'dataframe', 'dataframe-with-categorical']) @pytest.mark.parametrize("output", ["array", "dataframe", "dataframe-with-categorical"])
def test_ranker_custom_objective(output, cluster): def test_ranker_custom_objective(output, cluster):
with Client(cluster) as client: with Client(cluster) as client:
if output == 'dataframe-with-categorical': if output == "dataframe-with-categorical":
X, y, w, g, dX, dy, dw, dg = _create_data( X, y, w, g, dX, dy, dw, dg = _create_data(
objective='ranking', objective="ranking", output=output, group=group_sizes, n_features=1, n_informative=1
output=output,
group=group_sizes,
n_features=1,
n_informative=1
) )
else: else:
X, y, w, g, dX, dy, dw, dg = _create_data( X, y, w, g, dX, dy, dw, dg = _create_data(objective="ranking", output=output, group=group_sizes)
objective='ranking',
output=output,
group=group_sizes
)
# rebalance small dask.Array dataset for better performance. # rebalance small dask.Array dataset for better performance.
if output == 'array': if output == "array":
dX = dX.persist() dX = dX.persist()
dy = dy.persist() dy = dy.persist()
dw = dw.persist() dw = dw.persist()
...@@ -917,15 +778,10 @@ def test_ranker_custom_objective(output, cluster): ...@@ -917,15 +778,10 @@ def test_ranker_custom_objective(output, cluster):
"n_estimators": 50, "n_estimators": 50,
"num_leaves": 20, "num_leaves": 20,
"min_child_samples": 1, "min_child_samples": 1,
"objective": _objective_least_squares "objective": _objective_least_squares,
} }
dask_ranker = lgb.DaskLGBMRanker( dask_ranker = lgb.DaskLGBMRanker(client=client, time_out=5, tree_learner_type="data", **params)
client=client,
time_out=5,
tree_learner_type="data",
**params
)
dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg) dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
rnkvec_dask = dask_ranker.predict(dX).compute() rnkvec_dask = dask_ranker.predict(dX).compute()
dask_ranker_local = dask_ranker.to_local() dask_ranker_local = dask_ranker.to_local()
...@@ -946,13 +802,13 @@ def test_ranker_custom_objective(output, cluster): ...@@ -946,13 +802,13 @@ def test_ranker_custom_objective(output, cluster):
assert callable(dask_ranker_local.objective_) assert callable(dask_ranker_local.objective_)
@pytest.mark.parametrize('task', tasks) @pytest.mark.parametrize("task", tasks)
@pytest.mark.parametrize('output', data_output) @pytest.mark.parametrize("output", data_output)
@pytest.mark.parametrize('eval_sizes', [[0.5, 1, 1.5], [0]]) @pytest.mark.parametrize("eval_sizes", [[0.5, 1, 1.5], [0]])
@pytest.mark.parametrize('eval_names_prefix', ['specified', None]) @pytest.mark.parametrize("eval_names_prefix", ["specified", None])
def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix, cluster): def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix, cluster):
if task == 'ranking' and output == 'scipy_csr_matrix': if task == "ranking" and output == "scipy_csr_matrix":
pytest.skip('LGBMRanker is not currently tested on sparse matrices') pytest.skip("LGBMRanker is not currently tested on sparse matrices")
with Client(cluster) as client: with Client(cluster) as client:
# Use larger trainset to prevent premature stopping due to zero loss, causing num_trees() < n_estimators. # Use larger trainset to prevent premature stopping due to zero loss, causing num_trees() < n_estimators.
...@@ -966,36 +822,33 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix, ...@@ -966,36 +822,33 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix,
eval_init_score = None eval_init_score = None
if eval_names_prefix: if eval_names_prefix:
eval_names = [f'{eval_names_prefix}_{i}' for i in range(len(eval_sizes))] eval_names = [f"{eval_names_prefix}_{i}" for i in range(len(eval_sizes))]
else: else:
eval_names = None eval_names = None
X, y, w, g, dX, dy, dw, dg = _create_data( X, y, w, g, dX, dy, dw, dg = _create_data(
objective=task, objective=task, n_samples=n_samples, output=output, chunk_size=chunk_size
n_samples=n_samples,
output=output,
chunk_size=chunk_size
) )
if task == 'ranking': if task == "ranking":
eval_metrics = ['ndcg'] eval_metrics = ["ndcg"]
eval_at = (5, 6) eval_at = (5, 6)
eval_metric_names = [f'ndcg@{k}' for k in eval_at] eval_metric_names = [f"ndcg@{k}" for k in eval_at]
eval_group = [] eval_group = []
else: else:
# test eval_class_weight, eval_init_score on binary-classification task. # test eval_class_weight, eval_init_score on binary-classification task.
# Note: objective's default `metric` will be evaluated in evals_result_ in addition to all eval_metrics. # Note: objective's default `metric` will be evaluated in evals_result_ in addition to all eval_metrics.
if task == 'binary-classification': if task == "binary-classification":
eval_metrics = ['binary_error', 'auc'] eval_metrics = ["binary_error", "auc"]
eval_metric_names = ['binary_logloss', 'binary_error', 'auc'] eval_metric_names = ["binary_logloss", "binary_error", "auc"]
eval_class_weight = [] eval_class_weight = []
eval_init_score = [] eval_init_score = []
elif task == 'multiclass-classification': elif task == "multiclass-classification":
eval_metrics = ['multi_error'] eval_metrics = ["multi_error"]
eval_metric_names = ['multi_logloss', 'multi_error'] eval_metric_names = ["multi_logloss", "multi_error"]
elif task == 'regression': elif task == "regression":
eval_metrics = ['l1'] eval_metrics = ["l1"]
eval_metric_names = ['l2', 'l1'] eval_metric_names = ["l2", "l1"]
# create eval_sets by creating new datasets or copying training data. # create eval_sets by creating new datasets or copying training data.
for eval_size in eval_sizes: for eval_size in eval_sizes:
...@@ -1008,23 +861,20 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix, ...@@ -1008,23 +861,20 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix,
else: else:
n_eval_samples = max(chunk_size, int(n_samples * eval_size)) n_eval_samples = max(chunk_size, int(n_samples * eval_size))
_, y_e, _, _, dX_e, dy_e, dw_e, dg_e = _create_data( _, y_e, _, _, dX_e, dy_e, dw_e, dg_e = _create_data(
objective=task, objective=task, n_samples=n_eval_samples, output=output, chunk_size=chunk_size
n_samples=n_eval_samples,
output=output,
chunk_size=chunk_size
) )
eval_set.append((dX_e, dy_e)) eval_set.append((dX_e, dy_e))
eval_sample_weight.append(dw_e) eval_sample_weight.append(dw_e)
if task == 'ranking': if task == "ranking":
eval_group.append(dg_e) eval_group.append(dg_e)
if task == 'binary-classification': if task == "binary-classification":
n_neg = np.sum(y_e == 0) n_neg = np.sum(y_e == 0)
n_pos = np.sum(y_e == 1) n_pos = np.sum(y_e == 1)
eval_class_weight.append({0: n_neg / n_pos, 1: n_pos / n_neg}) eval_class_weight.append({0: n_neg / n_pos, 1: n_pos / n_neg})
init_score_value = np.log(np.mean(y_e) / (1 - np.mean(y_e))) init_score_value = np.log(np.mean(y_e) / (1 - np.mean(y_e)))
if 'dataframe' in output: if "dataframe" in output:
d_init_score = dy_e.map_partitions(lambda x, val=init_score_value: pd.Series([val] * x.size)) d_init_score = dy_e.map_partitions(lambda x, val=init_score_value: pd.Series([val] * x.size))
else: else:
d_init_score = dy_e.map_blocks(lambda x, val=init_score_value: np.repeat(val, x.size)) d_init_score = dy_e.map_blocks(lambda x, val=init_score_value: np.repeat(val, x.size))
...@@ -1032,44 +882,36 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix, ...@@ -1032,44 +882,36 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix,
eval_init_score.append(d_init_score) eval_init_score.append(d_init_score)
fit_trees = 50 fit_trees = 50
params = { params = {"random_state": 42, "n_estimators": fit_trees, "num_leaves": 2}
"random_state": 42,
"n_estimators": fit_trees,
"num_leaves": 2
}
model_factory = task_to_dask_factory[task] model_factory = task_to_dask_factory[task]
dask_model = model_factory( dask_model = model_factory(client=client, **params)
client=client,
**params
)
fit_params = { fit_params = {
'X': dX, "X": dX,
'y': dy, "y": dy,
'eval_set': eval_set, "eval_set": eval_set,
'eval_names': eval_names, "eval_names": eval_names,
'eval_sample_weight': eval_sample_weight, "eval_sample_weight": eval_sample_weight,
'eval_init_score': eval_init_score, "eval_init_score": eval_init_score,
'eval_metric': eval_metrics "eval_metric": eval_metrics,
} }
if task == 'ranking': if task == "ranking":
fit_params.update( fit_params.update({"group": dg, "eval_group": eval_group, "eval_at": eval_at})
{'group': dg, elif task == "binary-classification":
'eval_group': eval_group, fit_params.update({"eval_class_weight": eval_class_weight})
'eval_at': eval_at}
)
elif task == 'binary-classification':
fit_params.update({'eval_class_weight': eval_class_weight})
if eval_sizes == [0]: if eval_sizes == [0]:
with pytest.warns(UserWarning, match='Worker (.*) was not allocated eval_set data. Therefore evals_result_ and best_score_ data may be unreliable.'): with pytest.warns(
UserWarning,
match="Worker (.*) was not allocated eval_set data. Therefore evals_result_ and best_score_ data may be unreliable.",
):
dask_model.fit(**fit_params) dask_model.fit(**fit_params)
else: else:
dask_model = dask_model.fit(**fit_params) dask_model = dask_model.fit(**fit_params)
# total number of trees scales up for ova classifier. # total number of trees scales up for ova classifier.
if task == 'multiclass-classification': if task == "multiclass-classification":
model_trees = fit_trees * dask_model.n_classes_ model_trees = fit_trees * dask_model.n_classes_
else: else:
model_trees = fit_trees model_trees = fit_trees
...@@ -1098,67 +940,45 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix, ...@@ -1098,67 +940,45 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix,
assert len(evals_result[eval_name][metric]) == fit_trees assert len(evals_result[eval_name][metric]) == fit_trees
@pytest.mark.parametrize('task', ['binary-classification', 'regression', 'ranking']) @pytest.mark.parametrize("task", ["binary-classification", "regression", "ranking"])
def test_eval_set_with_custom_eval_metric(task, cluster): def test_eval_set_with_custom_eval_metric(task, cluster):
with Client(cluster) as client: with Client(cluster) as client:
n_samples = 1000 n_samples = 1000
n_eval_samples = int(n_samples * 0.5) n_eval_samples = int(n_samples * 0.5)
chunk_size = 10 chunk_size = 10
output = 'array' output = "array"
X, y, w, g, dX, dy, dw, dg = _create_data( X, y, w, g, dX, dy, dw, dg = _create_data(
objective=task, objective=task, n_samples=n_samples, output=output, chunk_size=chunk_size
n_samples=n_samples,
output=output,
chunk_size=chunk_size
) )
_, _, _, _, dX_e, dy_e, _, dg_e = _create_data( _, _, _, _, dX_e, dy_e, _, dg_e = _create_data(
objective=task, objective=task, n_samples=n_eval_samples, output=output, chunk_size=chunk_size
n_samples=n_eval_samples,
output=output,
chunk_size=chunk_size
) )
if task == 'ranking': if task == "ranking":
eval_at = (5, 6) eval_at = (5, 6)
eval_metrics = ['ndcg', _constant_metric] eval_metrics = ["ndcg", _constant_metric]
eval_metric_names = [f'ndcg@{k}' for k in eval_at] + ['constant_metric'] eval_metric_names = [f"ndcg@{k}" for k in eval_at] + ["constant_metric"]
elif task == 'binary-classification': elif task == "binary-classification":
eval_metrics = ['binary_error', 'auc', _constant_metric] eval_metrics = ["binary_error", "auc", _constant_metric]
eval_metric_names = ['binary_logloss', 'binary_error', 'auc', 'constant_metric'] eval_metric_names = ["binary_logloss", "binary_error", "auc", "constant_metric"]
else: else:
eval_metrics = ['l1', _constant_metric] eval_metrics = ["l1", _constant_metric]
eval_metric_names = ['l2', 'l1', 'constant_metric'] eval_metric_names = ["l2", "l1", "constant_metric"]
fit_trees = 50 fit_trees = 50
params = { params = {"random_state": 42, "n_estimators": fit_trees, "num_leaves": 2}
"random_state": 42,
"n_estimators": fit_trees,
"num_leaves": 2
}
model_factory = task_to_dask_factory[task] model_factory = task_to_dask_factory[task]
dask_model = model_factory( dask_model = model_factory(client=client, **params)
client=client,
**params
)
eval_set = [(dX_e, dy_e)] eval_set = [(dX_e, dy_e)]
fit_params = { fit_params = {"X": dX, "y": dy, "eval_set": eval_set, "eval_metric": eval_metrics}
'X': dX, if task == "ranking":
'y': dy, fit_params.update({"group": dg, "eval_group": [dg_e], "eval_at": eval_at})
'eval_set': eval_set,
'eval_metric': eval_metrics
}
if task == 'ranking':
fit_params.update(
{'group': dg,
'eval_group': [dg_e],
'eval_at': eval_at}
)
dask_model = dask_model.fit(**fit_params) dask_model = dask_model.fit(**fit_params)
eval_name = 'valid_0' eval_name = "valid_0"
evals_result = dask_model.evals_result_ evals_result = dask_model.evals_result_
assert len(evals_result) == 1 assert len(evals_result) == 1
assert eval_name in evals_result assert eval_name in evals_result
...@@ -1167,29 +987,21 @@ def test_eval_set_with_custom_eval_metric(task, cluster): ...@@ -1167,29 +987,21 @@ def test_eval_set_with_custom_eval_metric(task, cluster):
assert metric in evals_result[eval_name] assert metric in evals_result[eval_name]
assert len(evals_result[eval_name][metric]) == fit_trees assert len(evals_result[eval_name][metric]) == fit_trees
np.testing.assert_allclose(evals_result[eval_name]['constant_metric'], 0.708) np.testing.assert_allclose(evals_result[eval_name]["constant_metric"], 0.708)
@pytest.mark.parametrize('task', tasks) @pytest.mark.parametrize("task", tasks)
def test_training_works_if_client_not_provided_or_set_after_construction(task, cluster): def test_training_works_if_client_not_provided_or_set_after_construction(task, cluster):
with Client(cluster) as client: with Client(cluster) as client:
_, _, _, _, dX, dy, _, dg = _create_data( _, _, _, _, dX, dy, _, dg = _create_data(objective=task, output="array", group=None)
objective=task,
output='array',
group=None
)
model_factory = task_to_dask_factory[task] model_factory = task_to_dask_factory[task]
params = { params = {"time_out": 5, "n_estimators": 1, "num_leaves": 2}
"time_out": 5,
"n_estimators": 1,
"num_leaves": 2
}
# should be able to use the class without specifying a client # should be able to use the class without specifying a client
dask_model = model_factory(**params) dask_model = model_factory(**params)
assert dask_model.client is None assert dask_model.client is None
with pytest.raises(lgb.compat.LGBMNotFittedError, match='Cannot access property client_ before calling fit'): with pytest.raises(lgb.compat.LGBMNotFittedError, match="Cannot access property client_ before calling fit"):
dask_model.client_ dask_model.client_
dask_model.fit(dX, dy, group=dg) dask_model.fit(dX, dy, group=dg)
...@@ -1213,7 +1025,7 @@ def test_training_works_if_client_not_provided_or_set_after_construction(task, c ...@@ -1213,7 +1025,7 @@ def test_training_works_if_client_not_provided_or_set_after_construction(task, c
dask_model.set_params(client=client) dask_model.set_params(client=client)
assert dask_model.client == client assert dask_model.client == client
with pytest.raises(lgb.compat.LGBMNotFittedError, match='Cannot access property client_ before calling fit'): with pytest.raises(lgb.compat.LGBMNotFittedError, match="Cannot access property client_ before calling fit"):
dask_model.client_ dask_model.client_
dask_model.fit(dX, dy, group=dg) dask_model.fit(dX, dy, group=dg)
...@@ -1233,34 +1045,23 @@ def test_training_works_if_client_not_provided_or_set_after_construction(task, c ...@@ -1233,34 +1045,23 @@ def test_training_works_if_client_not_provided_or_set_after_construction(task, c
local_model.client_ local_model.client_
@pytest.mark.parametrize('serializer', ['pickle', 'joblib', 'cloudpickle']) @pytest.mark.parametrize("serializer", ["pickle", "joblib", "cloudpickle"])
@pytest.mark.parametrize('task', tasks) @pytest.mark.parametrize("task", tasks)
@pytest.mark.parametrize('set_client', [True, False]) @pytest.mark.parametrize("set_client", [True, False])
def test_model_and_local_version_are_picklable_whether_or_not_client_set_explicitly(serializer, task, set_client, tmp_path, cluster, cluster2): def test_model_and_local_version_are_picklable_whether_or_not_client_set_explicitly(
serializer, task, set_client, tmp_path, cluster, cluster2
):
with Client(cluster) as client1: with Client(cluster) as client1:
# data on cluster1 # data on cluster1
X_1, _, _, _, dX_1, dy_1, _, dg_1 = _create_data( X_1, _, _, _, dX_1, dy_1, _, dg_1 = _create_data(objective=task, output="array", group=None)
objective=task,
output='array',
group=None
)
with Client(cluster2) as client2: with Client(cluster2) as client2:
# create identical data on cluster2 # create identical data on cluster2
X_2, _, _, _, dX_2, dy_2, _, dg_2 = _create_data( X_2, _, _, _, dX_2, dy_2, _, dg_2 = _create_data(objective=task, output="array", group=None)
objective=task,
output='array',
group=None
)
model_factory = task_to_dask_factory[task] model_factory = task_to_dask_factory[task]
params = { params = {"time_out": 5, "n_estimators": 1, "num_leaves": 2}
"time_out": 5,
"n_estimators": 1,
"num_leaves": 2
}
# at this point, the result of default_client() is client2 since it was the most recently # at this point, the result of default_client() is client2 since it was the most recently
# created. So setting client to client1 here to test that you can select a non-default client # created. So setting client to client1 here to test that you can select a non-default client
...@@ -1277,33 +1078,21 @@ def test_model_and_local_version_are_picklable_whether_or_not_client_set_explici ...@@ -1277,33 +1078,21 @@ def test_model_and_local_version_are_picklable_whether_or_not_client_set_explici
else: else:
assert dask_model.client is None assert dask_model.client is None
with pytest.raises(lgb.compat.LGBMNotFittedError, match='Cannot access property client_ before calling fit'): with pytest.raises(
lgb.compat.LGBMNotFittedError, match="Cannot access property client_ before calling fit"
):
dask_model.client_ dask_model.client_
assert "client" not in local_model.get_params() assert "client" not in local_model.get_params()
assert getattr(local_model, "client", None) is None assert getattr(local_model, "client", None) is None
tmp_file = tmp_path / "model-1.pkl" tmp_file = tmp_path / "model-1.pkl"
pickle_obj( pickle_obj(obj=dask_model, filepath=tmp_file, serializer=serializer)
obj=dask_model, model_from_disk = unpickle_obj(filepath=tmp_file, serializer=serializer)
filepath=tmp_file,
serializer=serializer
)
model_from_disk = unpickle_obj(
filepath=tmp_file,
serializer=serializer
)
local_tmp_file = tmp_path / "local-model-1.pkl" local_tmp_file = tmp_path / "local-model-1.pkl"
pickle_obj( pickle_obj(obj=local_model, filepath=local_tmp_file, serializer=serializer)
obj=local_model, local_model_from_disk = unpickle_obj(filepath=local_tmp_file, serializer=serializer)
filepath=local_tmp_file,
serializer=serializer
)
local_model_from_disk = unpickle_obj(
filepath=local_tmp_file,
serializer=serializer
)
assert model_from_disk.client is None assert model_from_disk.client is None
...@@ -1312,7 +1101,9 @@ def test_model_and_local_version_are_picklable_whether_or_not_client_set_explici ...@@ -1312,7 +1101,9 @@ def test_model_and_local_version_are_picklable_whether_or_not_client_set_explici
else: else:
assert dask_model.client is None assert dask_model.client is None
with pytest.raises(lgb.compat.LGBMNotFittedError, match='Cannot access property client_ before calling fit'): with pytest.raises(
lgb.compat.LGBMNotFittedError, match="Cannot access property client_ before calling fit"
):
dask_model.client_ dask_model.client_
# client will always be None after unpickling # client will always be None after unpickling
...@@ -1340,26 +1131,12 @@ def test_model_and_local_version_are_picklable_whether_or_not_client_set_explici ...@@ -1340,26 +1131,12 @@ def test_model_and_local_version_are_picklable_whether_or_not_client_set_explici
local_model.client_ local_model.client_
tmp_file2 = tmp_path / "model-2.pkl" tmp_file2 = tmp_path / "model-2.pkl"
pickle_obj( pickle_obj(obj=dask_model, filepath=tmp_file2, serializer=serializer)
obj=dask_model, fitted_model_from_disk = unpickle_obj(filepath=tmp_file2, serializer=serializer)
filepath=tmp_file2,
serializer=serializer
)
fitted_model_from_disk = unpickle_obj(
filepath=tmp_file2,
serializer=serializer
)
local_tmp_file2 = tmp_path / "local-model-2.pkl" local_tmp_file2 = tmp_path / "local-model-2.pkl"
pickle_obj( pickle_obj(obj=local_model, filepath=local_tmp_file2, serializer=serializer)
obj=local_model, local_fitted_model_from_disk = unpickle_obj(filepath=local_tmp_file2, serializer=serializer)
filepath=local_tmp_file2,
serializer=serializer
)
local_fitted_model_from_disk = unpickle_obj(
filepath=local_tmp_file2,
serializer=serializer
)
if set_client: if set_client:
assert dask_model.client == client1 assert dask_model.client == client1
...@@ -1405,35 +1182,25 @@ def test_warns_and_continues_on_unrecognized_tree_learner(cluster): ...@@ -1405,35 +1182,25 @@ def test_warns_and_continues_on_unrecognized_tree_learner(cluster):
X = da.random.random((1e3, 10)) X = da.random.random((1e3, 10))
y = da.random.random((1e3, 1)) y = da.random.random((1e3, 1))
dask_regressor = lgb.DaskLGBMRegressor( dask_regressor = lgb.DaskLGBMRegressor(
client=client, client=client, time_out=5, tree_learner="some-nonsense-value", n_estimators=1, num_leaves=2
time_out=5,
tree_learner='some-nonsense-value',
n_estimators=1,
num_leaves=2
) )
with pytest.warns(UserWarning, match='Parameter tree_learner set to some-nonsense-value'): with pytest.warns(UserWarning, match="Parameter tree_learner set to some-nonsense-value"):
dask_regressor = dask_regressor.fit(X, y) dask_regressor = dask_regressor.fit(X, y)
assert dask_regressor.fitted_ assert dask_regressor.fitted_
@pytest.mark.parametrize('tree_learner', ['data_parallel', 'voting_parallel']) @pytest.mark.parametrize("tree_learner", ["data_parallel", "voting_parallel"])
def test_training_respects_tree_learner_aliases(tree_learner, cluster): def test_training_respects_tree_learner_aliases(tree_learner, cluster):
with Client(cluster) as client: with Client(cluster) as client:
task = 'regression' task = "regression"
_, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output='array') _, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output="array")
dask_factory = task_to_dask_factory[task] dask_factory = task_to_dask_factory[task]
dask_model = dask_factory( dask_model = dask_factory(client=client, tree_learner=tree_learner, time_out=5, n_estimators=10, num_leaves=15)
client=client,
tree_learner=tree_learner,
time_out=5,
n_estimators=10,
num_leaves=15
)
dask_model.fit(dX, dy, sample_weight=dw, group=dg) dask_model.fit(dX, dy, sample_weight=dw, group=dg)
assert dask_model.fitted_ assert dask_model.fitted_
assert dask_model.get_params()['tree_learner'] == tree_learner assert dask_model.get_params()["tree_learner"] == tree_learner
def test_error_on_feature_parallel_tree_learner(cluster): def test_error_on_feature_parallel_tree_learner(cluster):
...@@ -1444,39 +1211,30 @@ def test_error_on_feature_parallel_tree_learner(cluster): ...@@ -1444,39 +1211,30 @@ def test_error_on_feature_parallel_tree_learner(cluster):
_ = wait([X, y]) _ = wait([X, y])
client.rebalance() client.rebalance()
dask_regressor = lgb.DaskLGBMRegressor( dask_regressor = lgb.DaskLGBMRegressor(
client=client, client=client, time_out=5, tree_learner="feature_parallel", n_estimators=1, num_leaves=2
time_out=5,
tree_learner='feature_parallel',
n_estimators=1,
num_leaves=2
) )
with pytest.raises(lgb.basic.LightGBMError, match='Do not support feature parallel in c api'): with pytest.raises(lgb.basic.LightGBMError, match="Do not support feature parallel in c api"):
dask_regressor = dask_regressor.fit(X, y) dask_regressor = dask_regressor.fit(X, y)
def test_errors(cluster): def test_errors(cluster):
with Client(cluster) as client: with Client(cluster) as client:
def f(part): def f(part):
raise Exception('foo') raise Exception("foo")
df = dd.demo.make_timeseries() df = dd.demo.make_timeseries()
df = df.map_partitions(f, meta=df._meta) df = df.map_partitions(f, meta=df._meta)
with pytest.raises(Exception) as info: with pytest.raises(Exception) as info:
lgb.dask._train( lgb.dask._train(client=client, data=df, label=df.x, params={}, model_factory=lgb.LGBMClassifier)
client=client, assert "foo" in str(info.value)
data=df,
label=df.x,
params={},
model_factory=lgb.LGBMClassifier
)
assert 'foo' in str(info.value)
@pytest.mark.parametrize('task', tasks) @pytest.mark.parametrize("task", tasks)
@pytest.mark.parametrize('output', data_output) @pytest.mark.parametrize("output", data_output)
def test_training_succeeds_even_if_some_workers_do_not_have_any_data(task, output, cluster_three_workers): def test_training_succeeds_even_if_some_workers_do_not_have_any_data(task, output, cluster_three_workers):
if task == 'ranking' and output == 'scipy_csr_matrix': if task == "ranking" and output == "scipy_csr_matrix":
pytest.skip('LGBMRanker is not currently tested on sparse matrices') pytest.skip("LGBMRanker is not currently tested on sparse matrices")
with Client(cluster_three_workers) as client: with Client(cluster_three_workers) as client:
_, y, _, _, dX, dy, dw, dg = _create_data( _, y, _, _, dX, dy, dw, dg = _create_data(
...@@ -1489,7 +1247,7 @@ def test_training_succeeds_even_if_some_workers_do_not_have_any_data(task, outpu ...@@ -1489,7 +1247,7 @@ def test_training_succeeds_even_if_some_workers_do_not_have_any_data(task, outpu
dask_model_factory = task_to_dask_factory[task] dask_model_factory = task_to_dask_factory[task]
workers = list(client.scheduler_info()['workers'].keys()) workers = list(client.scheduler_info()["workers"].keys())
assert len(workers) == 3 assert len(workers) == 3
first_two_workers = workers[:2] first_two_workers = workers[:2]
...@@ -1506,33 +1264,28 @@ def test_training_succeeds_even_if_some_workers_do_not_have_any_data(task, outpu ...@@ -1506,33 +1264,28 @@ def test_training_succeeds_even_if_some_workers_do_not_have_any_data(task, outpu
assert len(workers_with_data) == 2 assert len(workers_with_data) == 2
params = { params = {
'time_out': 5, "time_out": 5,
'random_state': 42, "random_state": 42,
'num_leaves': 10, "num_leaves": 10,
'n_estimators': 20, "n_estimators": 20,
} }
dask_model = dask_model_factory(tree='data', client=client, **params) dask_model = dask_model_factory(tree="data", client=client, **params)
dask_model.fit(dX, dy, group=dg, sample_weight=dw) dask_model.fit(dX, dy, group=dg, sample_weight=dw)
dask_preds = dask_model.predict(dX).compute() dask_preds = dask_model.predict(dX).compute()
if task == 'regression': if task == "regression":
score = r2_score(y, dask_preds) score = r2_score(y, dask_preds)
elif task.endswith('classification'): elif task.endswith("classification"):
score = accuracy_score(y, dask_preds) score = accuracy_score(y, dask_preds)
else: else:
score = spearmanr(dask_preds, y).correlation score = spearmanr(dask_preds, y).correlation
assert score > 0.9 assert score > 0.9
@pytest.mark.parametrize('task', tasks) @pytest.mark.parametrize("task", tasks)
def test_network_params_not_required_but_respected_if_given(task, listen_port, cluster): def test_network_params_not_required_but_respected_if_given(task, listen_port, cluster):
with Client(cluster) as client: with Client(cluster) as client:
_, _, _, _, dX, dy, _, dg = _create_data( _, _, _, _, dX, dy, _, dg = _create_data(objective=task, output="array", chunk_size=10, group=None)
objective=task,
output='array',
chunk_size=10,
group=None
)
dask_model_factory = task_to_dask_factory[task] dask_model_factory = task_to_dask_factory[task]
...@@ -1547,11 +1300,11 @@ def test_network_params_not_required_but_respected_if_given(task, listen_port, c ...@@ -1547,11 +1300,11 @@ def test_network_params_not_required_but_respected_if_given(task, listen_port, c
dask_model1.fit(dX, dy, group=dg) dask_model1.fit(dX, dy, group=dg)
assert dask_model1.fitted_ assert dask_model1.fitted_
params = dask_model1.get_params() params = dask_model1.get_params()
assert 'local_listen_port' not in params assert "local_listen_port" not in params
assert 'machines' not in params assert "machines" not in params
# model 2 - machines given # model 2 - machines given
workers = list(client.scheduler_info()['workers']) workers = list(client.scheduler_info()["workers"])
workers_hostname = _get_workers_hostname(cluster) workers_hostname = _get_workers_hostname(cluster)
remote_sockets, open_ports = lgb.dask._assign_open_ports_to_workers(client, workers) remote_sockets, open_ports = lgb.dask._assign_open_ports_to_workers(client, workers)
for s in remote_sockets.values(): for s in remote_sockets.values():
...@@ -1559,58 +1312,43 @@ def test_network_params_not_required_but_respected_if_given(task, listen_port, c ...@@ -1559,58 +1312,43 @@ def test_network_params_not_required_but_respected_if_given(task, listen_port, c
dask_model2 = dask_model_factory( dask_model2 = dask_model_factory(
n_estimators=5, n_estimators=5,
num_leaves=5, num_leaves=5,
machines=",".join([ machines=",".join([f"{workers_hostname}:{port}" for port in open_ports.values()]),
f"{workers_hostname}:{port}"
for port in open_ports.values()
]),
) )
dask_model2.fit(dX, dy, group=dg) dask_model2.fit(dX, dy, group=dg)
assert dask_model2.fitted_ assert dask_model2.fitted_
params = dask_model2.get_params() params = dask_model2.get_params()
assert 'local_listen_port' not in params assert "local_listen_port" not in params
assert 'machines' in params assert "machines" in params
# model 3 - local_listen_port given # model 3 - local_listen_port given
# training should fail because LightGBM will try to use the same # training should fail because LightGBM will try to use the same
# port for multiple worker processes on the same machine # port for multiple worker processes on the same machine
dask_model3 = dask_model_factory( dask_model3 = dask_model_factory(n_estimators=5, num_leaves=5, local_listen_port=listen_port)
n_estimators=5,
num_leaves=5,
local_listen_port=listen_port
)
error_msg = "has multiple Dask worker processes running on it" error_msg = "has multiple Dask worker processes running on it"
with pytest.raises(lgb.basic.LightGBMError, match=error_msg): with pytest.raises(lgb.basic.LightGBMError, match=error_msg):
dask_model3.fit(dX, dy, group=dg) dask_model3.fit(dX, dy, group=dg)
@pytest.mark.parametrize('task', tasks) @pytest.mark.parametrize("task", tasks)
def test_machines_should_be_used_if_provided(task, cluster): def test_machines_should_be_used_if_provided(task, cluster):
pytest.skip("skipping due to timeout issues discussed in https://github.com/microsoft/LightGBM/issues/5390") pytest.skip("skipping due to timeout issues discussed in https://github.com/microsoft/LightGBM/issues/5390")
with Client(cluster) as client: with Client(cluster) as client:
_, _, _, _, dX, dy, _, dg = _create_data( _, _, _, _, dX, dy, _, dg = _create_data(objective=task, output="array", chunk_size=10, group=None)
objective=task,
output='array',
chunk_size=10,
group=None
)
dask_model_factory = task_to_dask_factory[task] dask_model_factory = task_to_dask_factory[task]
# rebalance data to be sure that each worker has a piece of the data # rebalance data to be sure that each worker has a piece of the data
client.rebalance() client.rebalance()
n_workers = len(client.scheduler_info()['workers']) n_workers = len(client.scheduler_info()["workers"])
assert n_workers > 1 assert n_workers > 1
workers_hostname = _get_workers_hostname(cluster) workers_hostname = _get_workers_hostname(cluster)
open_ports = lgb.dask._find_n_open_ports(n_workers) open_ports = lgb.dask._find_n_open_ports(n_workers)
dask_model = dask_model_factory( dask_model = dask_model_factory(
n_estimators=5, n_estimators=5,
num_leaves=5, num_leaves=5,
machines=",".join([ machines=",".join([f"{workers_hostname}:{port}" for port in open_ports]),
f"{workers_hostname}:{port}"
for port in open_ports
]),
) )
# test that "machines" is actually respected by creating a socket that uses # test that "machines" is actually respected by creating a socket that uses
...@@ -1626,12 +1364,7 @@ def test_machines_should_be_used_if_provided(task, cluster): ...@@ -1626,12 +1364,7 @@ def test_machines_should_be_used_if_provided(task, cluster):
# an informative error should be raised if "machines" has duplicates # an informative error should be raised if "machines" has duplicates
one_open_port = lgb.dask._find_n_open_ports(1) one_open_port = lgb.dask._find_n_open_ports(1)
dask_model.set_params( dask_model.set_params(machines=",".join([f"127.0.0.1:{one_open_port}" for _ in range(n_workers)]))
machines=",".join([
f"127.0.0.1:{one_open_port}"
for _ in range(n_workers)
])
)
with pytest.raises(ValueError, match="Found duplicates in 'machines'"): with pytest.raises(ValueError, match="Found duplicates in 'machines'"):
dask_model.fit(dX, dy, group=dg) dask_model.fit(dX, dy, group=dg)
...@@ -1641,8 +1374,8 @@ def test_machines_should_be_used_if_provided(task, cluster): ...@@ -1641,8 +1374,8 @@ def test_machines_should_be_used_if_provided(task, cluster):
[ [
(lgb.DaskLGBMClassifier, lgb.LGBMClassifier), (lgb.DaskLGBMClassifier, lgb.LGBMClassifier),
(lgb.DaskLGBMRegressor, lgb.LGBMRegressor), (lgb.DaskLGBMRegressor, lgb.LGBMRegressor),
(lgb.DaskLGBMRanker, lgb.LGBMRanker) (lgb.DaskLGBMRanker, lgb.LGBMRanker),
] ],
) )
def test_dask_classes_and_sklearn_equivalents_have_identical_constructors_except_client_arg(classes): def test_dask_classes_and_sklearn_equivalents_have_identical_constructors_except_client_arg(classes):
dask_spec = inspect.getfullargspec(classes[0]) dask_spec = inspect.getfullargspec(classes[0])
...@@ -1655,7 +1388,7 @@ def test_dask_classes_and_sklearn_equivalents_have_identical_constructors_except ...@@ -1655,7 +1388,7 @@ def test_dask_classes_and_sklearn_equivalents_have_identical_constructors_except
# "client" should be the only different, and the final argument # "client" should be the only different, and the final argument
assert dask_spec.args[:-1] == sklearn_spec.args assert dask_spec.args[:-1] == sklearn_spec.args
assert dask_spec.defaults[:-1] == sklearn_spec.defaults assert dask_spec.defaults[:-1] == sklearn_spec.defaults
assert dask_spec.args[-1] == 'client' assert dask_spec.args[-1] == "client"
assert dask_spec.defaults[-1] is None assert dask_spec.defaults[-1] is None
...@@ -1668,18 +1401,18 @@ def test_dask_classes_and_sklearn_equivalents_have_identical_constructors_except ...@@ -1668,18 +1401,18 @@ def test_dask_classes_and_sklearn_equivalents_have_identical_constructors_except
(lgb.DaskLGBMRegressor.fit, lgb.LGBMRegressor.fit), (lgb.DaskLGBMRegressor.fit, lgb.LGBMRegressor.fit),
(lgb.DaskLGBMRegressor.predict, lgb.LGBMRegressor.predict), (lgb.DaskLGBMRegressor.predict, lgb.LGBMRegressor.predict),
(lgb.DaskLGBMRanker.fit, lgb.LGBMRanker.fit), (lgb.DaskLGBMRanker.fit, lgb.LGBMRanker.fit),
(lgb.DaskLGBMRanker.predict, lgb.LGBMRanker.predict) (lgb.DaskLGBMRanker.predict, lgb.LGBMRanker.predict),
] ],
) )
def test_dask_methods_and_sklearn_equivalents_have_similar_signatures(methods): def test_dask_methods_and_sklearn_equivalents_have_similar_signatures(methods):
dask_spec = inspect.getfullargspec(methods[0]) dask_spec = inspect.getfullargspec(methods[0])
sklearn_spec = inspect.getfullargspec(methods[1]) sklearn_spec = inspect.getfullargspec(methods[1])
dask_params = inspect.signature(methods[0]).parameters dask_params = inspect.signature(methods[0]).parameters
sklearn_params = inspect.signature(methods[1]).parameters sklearn_params = inspect.signature(methods[1]).parameters
assert dask_spec.args == sklearn_spec.args[:len(dask_spec.args)] assert dask_spec.args == sklearn_spec.args[: len(dask_spec.args)]
assert dask_spec.varargs == sklearn_spec.varargs assert dask_spec.varargs == sklearn_spec.varargs
if sklearn_spec.varkw: if sklearn_spec.varkw:
assert dask_spec.varkw == sklearn_spec.varkw[:len(dask_spec.varkw)] assert dask_spec.varkw == sklearn_spec.varkw[: len(dask_spec.varkw)]
assert dask_spec.kwonlyargs == sklearn_spec.kwonlyargs assert dask_spec.kwonlyargs == sklearn_spec.kwonlyargs
assert dask_spec.kwonlydefaults == sklearn_spec.kwonlydefaults assert dask_spec.kwonlydefaults == sklearn_spec.kwonlydefaults
for param in dask_spec.args: for param in dask_spec.args:
...@@ -1687,14 +1420,10 @@ def test_dask_methods_and_sklearn_equivalents_have_similar_signatures(methods): ...@@ -1687,14 +1420,10 @@ def test_dask_methods_and_sklearn_equivalents_have_similar_signatures(methods):
assert dask_params[param].default == sklearn_params[param].default, error_msg assert dask_params[param].default == sklearn_params[param].default, error_msg
@pytest.mark.parametrize('task', tasks) @pytest.mark.parametrize("task", tasks)
def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task, cluster): def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task, cluster):
with Client(cluster): with Client(cluster):
_, _, _, _, dX, dy, dw, dg = _create_data( _, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output="dataframe", group=None)
objective=task,
output='dataframe',
group=None
)
model_factory = task_to_dask_factory[task] model_factory = task_to_dask_factory[task]
...@@ -1702,58 +1431,41 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task ...@@ -1702,58 +1431,41 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task
dy_col_array = dy.reshape(-1, 1) dy_col_array = dy.reshape(-1, 1)
assert len(dy_col_array.shape) == 2 and dy_col_array.shape[1] == 1 assert len(dy_col_array.shape) == 2 and dy_col_array.shape[1] == 1
params = { params = {"n_estimators": 1, "num_leaves": 3, "random_state": 0, "time_out": 5}
'n_estimators': 1,
'num_leaves': 3,
'random_state': 0,
'time_out': 5
}
model = model_factory(**params) model = model_factory(**params)
model.fit(dX, dy_col_array, sample_weight=dw, group=dg) model.fit(dX, dy_col_array, sample_weight=dw, group=dg)
assert model.fitted_ assert model.fitted_
@pytest.mark.parametrize('task', tasks) @pytest.mark.parametrize("task", tasks)
@pytest.mark.parametrize('output', data_output) @pytest.mark.parametrize("output", data_output)
def test_init_score(task, output, cluster): def test_init_score(task, output, cluster):
if task == 'ranking' and output == 'scipy_csr_matrix': if task == "ranking" and output == "scipy_csr_matrix":
pytest.skip('LGBMRanker is not currently tested on sparse matrices') pytest.skip("LGBMRanker is not currently tested on sparse matrices")
with Client(cluster) as client: with Client(cluster) as client:
_, _, _, _, dX, dy, dw, dg = _create_data( _, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output=output, group=None)
objective=task,
output=output,
group=None
)
model_factory = task_to_dask_factory[task] model_factory = task_to_dask_factory[task]
params = { params = {"n_estimators": 1, "num_leaves": 2, "time_out": 5}
'n_estimators': 1,
'num_leaves': 2,
'time_out': 5
}
init_score = random.random() init_score = random.random()
size_factor = 1 size_factor = 1
if task == 'multiclass-classification': if task == "multiclass-classification":
size_factor = 3 # number of classes size_factor = 3 # number of classes
if output.startswith('dataframe'): if output.startswith("dataframe"):
init_scores = dy.map_partitions(lambda x: pd.DataFrame([[init_score] * size_factor] * x.size)) init_scores = dy.map_partitions(lambda x: pd.DataFrame([[init_score] * size_factor] * x.size))
else: else:
init_scores = dy.map_blocks(lambda x: np.full((x.size, size_factor), init_score)) init_scores = dy.map_blocks(lambda x: np.full((x.size, size_factor), init_score))
model = model_factory(client=client, **params) model = model_factory(client=client, **params)
model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg) model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg)
# value of the root node is 0 when init_score is set # value of the root node is 0 when init_score is set
assert model.booster_.trees_to_dataframe()['value'][0] == 0 assert model.booster_.trees_to_dataframe()["value"][0] == 0
def sklearn_checks_to_run(): def sklearn_checks_to_run():
check_names = [ check_names = ["check_estimator_get_tags_default_keys", "check_get_params_invariance", "check_set_params"]
"check_estimator_get_tags_default_keys",
"check_get_params_invariance",
"check_set_params"
]
for check_name in check_names: for check_name in check_names:
check_func = getattr(sklearn_checks, check_name, None) check_func = getattr(sklearn_checks, check_name, None)
if check_func: if check_func:
...@@ -1782,79 +1494,58 @@ def test_parameters_default_constructible(estimator): ...@@ -1782,79 +1494,58 @@ def test_parameters_default_constructible(estimator):
sklearn_checks.check_parameters_default_constructible(name, Estimator) sklearn_checks.check_parameters_default_constructible(name, Estimator)
@pytest.mark.parametrize('task', tasks) @pytest.mark.parametrize("task", tasks)
@pytest.mark.parametrize('output', data_output) @pytest.mark.parametrize("output", data_output)
def test_predict_with_raw_score(task, output, cluster): def test_predict_with_raw_score(task, output, cluster):
if task == 'ranking' and output == 'scipy_csr_matrix': if task == "ranking" and output == "scipy_csr_matrix":
pytest.skip('LGBMRanker is not currently tested on sparse matrices') pytest.skip("LGBMRanker is not currently tested on sparse matrices")
with Client(cluster) as client: with Client(cluster) as client:
_, _, _, _, dX, dy, _, dg = _create_data( _, _, _, _, dX, dy, _, dg = _create_data(objective=task, output=output, group=None)
objective=task,
output=output,
group=None
)
model_factory = task_to_dask_factory[task] model_factory = task_to_dask_factory[task]
params = { params = {"client": client, "n_estimators": 1, "num_leaves": 2, "time_out": 5, "min_sum_hessian": 0}
'client': client,
'n_estimators': 1,
'num_leaves': 2,
'time_out': 5,
'min_sum_hessian': 0
}
model = model_factory(**params) model = model_factory(**params)
model.fit(dX, dy, group=dg) model.fit(dX, dy, group=dg)
raw_predictions = model.predict(dX, raw_score=True).compute() raw_predictions = model.predict(dX, raw_score=True).compute()
trees_df = model.booster_.trees_to_dataframe() trees_df = model.booster_.trees_to_dataframe()
leaves_df = trees_df[trees_df.node_depth == 2] leaves_df = trees_df[trees_df.node_depth == 2]
if task == 'multiclass-classification': if task == "multiclass-classification":
for i in range(model.n_classes_): for i in range(model.n_classes_):
class_df = leaves_df[leaves_df.tree_index == i] class_df = leaves_df[leaves_df.tree_index == i]
assert set(raw_predictions[:, i]) == set(class_df['value']) assert set(raw_predictions[:, i]) == set(class_df["value"])
else: else:
assert set(raw_predictions) == set(leaves_df['value']) assert set(raw_predictions) == set(leaves_df["value"])
if task.endswith('classification'): if task.endswith("classification"):
pred_proba_raw = model.predict_proba(dX, raw_score=True).compute() pred_proba_raw = model.predict_proba(dX, raw_score=True).compute()
assert_eq(raw_predictions, pred_proba_raw) assert_eq(raw_predictions, pred_proba_raw)
def test_distributed_quantized_training(cluster): def test_distributed_quantized_training(cluster):
with Client(cluster) as client: with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data( X, y, w, _, dX, dy, dw, _ = _create_data(objective="regression", output="array")
objective='regression',
output='array'
)
np.savetxt("data_dask.csv", np.hstack([np.array([y]).T, X]), fmt="%f,%f,%f,%f,%f") np.savetxt("data_dask.csv", np.hstack([np.array([y]).T, X]), fmt="%f,%f,%f,%f,%f")
params = { params = {
"boosting_type": 'gbdt', "boosting_type": "gbdt",
"n_estimators": 50, "n_estimators": 50,
"num_leaves": 31, "num_leaves": 31,
'use_quantized_grad': True, "use_quantized_grad": True,
'num_grad_quant_bins': 30, "num_grad_quant_bins": 30,
'quant_train_renew_leaf': True, "quant_train_renew_leaf": True,
'verbose': -1, "verbose": -1,
} }
quant_dask_classifier = lgb.DaskLGBMRegressor( quant_dask_classifier = lgb.DaskLGBMRegressor(client=client, time_out=5, **params)
client=client,
time_out=5,
**params
)
quant_dask_classifier = quant_dask_classifier.fit(dX, dy, sample_weight=dw) quant_dask_classifier = quant_dask_classifier.fit(dX, dy, sample_weight=dw)
quant_p1 = quant_dask_classifier.predict(dX) quant_p1 = quant_dask_classifier.predict(dX)
quant_rmse = np.sqrt(np.mean((quant_p1.compute() - y) ** 2)) quant_rmse = np.sqrt(np.mean((quant_p1.compute() - y) ** 2))
params["use_quantized_grad"] = False params["use_quantized_grad"] = False
dask_classifier = lgb.DaskLGBMRegressor( dask_classifier = lgb.DaskLGBMRegressor(client=client, time_out=5, **params)
client=client,
time_out=5,
**params
)
dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw) dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
p1 = dask_classifier.predict(dX) p1 = dask_classifier.predict(dX)
rmse = np.sqrt(np.mean((p1.compute() - y) ** 2)) rmse = np.sqrt(np.mean((p1.compute() - y) ** 2))
......
...@@ -28,7 +28,7 @@ def test_cpu_and_gpu_work(): ...@@ -28,7 +28,7 @@ def test_cpu_and_gpu_work():
params_gpu = params_cpu.copy() params_gpu = params_cpu.copy()
params_gpu["device"] = "gpu" params_gpu["device"] = "gpu"
# Double-precision floats are only supported on x86_64 with PoCL # Double-precision floats are only supported on x86_64 with PoCL
params_gpu["gpu_use_dp"] = (platform.machine() == "x86_64") params_gpu["gpu_use_dp"] = platform.machine() == "x86_64"
gpu_bst = lgb.train(params_gpu, data, num_boost_round=10) gpu_bst = lgb.train(params_gpu, data, num_boost_round=10)
gpu_score = log_loss(y, gpu_bst.predict(X)) gpu_score = log_loss(y, gpu_bst.predict(X))
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -9,7 +9,8 @@ from lightgbm.compat import GRAPHVIZ_INSTALLED, MATPLOTLIB_INSTALLED, PANDAS_INS ...@@ -9,7 +9,8 @@ from lightgbm.compat import GRAPHVIZ_INSTALLED, MATPLOTLIB_INSTALLED, PANDAS_INS
if MATPLOTLIB_INSTALLED: if MATPLOTLIB_INSTALLED:
import matplotlib import matplotlib
matplotlib.use('Agg')
matplotlib.use("Agg")
if GRAPHVIZ_INSTALLED: if GRAPHVIZ_INSTALLED:
import graphviz import graphviz
...@@ -18,8 +19,7 @@ from .utils import load_breast_cancer, make_synthetic_regression ...@@ -18,8 +19,7 @@ from .utils import load_breast_cancer, make_synthetic_regression
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def breast_cancer_split(): def breast_cancer_split():
return train_test_split(*load_breast_cancer(return_X_y=True), return train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=1)
test_size=0.1, random_state=1)
def _categorical_data(category_values_lower_bound, category_values_upper_bound): def _categorical_data(category_values_lower_bound, category_values_upper_bound):
...@@ -41,51 +41,51 @@ def train_data(breast_cancer_split): ...@@ -41,51 +41,51 @@ def train_data(breast_cancer_split):
@pytest.fixture @pytest.fixture
def params(): def params():
return {"objective": "binary", return {"objective": "binary", "verbose": -1, "num_leaves": 3}
"verbose": -1,
"num_leaves": 3}
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason='matplotlib is not installed') @pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason="matplotlib is not installed")
def test_plot_importance(params, breast_cancer_split, train_data): def test_plot_importance(params, breast_cancer_split, train_data):
X_train, _, y_train, _ = breast_cancer_split X_train, _, y_train, _ = breast_cancer_split
gbm0 = lgb.train(params, train_data, num_boost_round=10) gbm0 = lgb.train(params, train_data, num_boost_round=10)
ax0 = lgb.plot_importance(gbm0) ax0 = lgb.plot_importance(gbm0)
assert isinstance(ax0, matplotlib.axes.Axes) assert isinstance(ax0, matplotlib.axes.Axes)
assert ax0.get_title() == 'Feature importance' assert ax0.get_title() == "Feature importance"
assert ax0.get_xlabel() == 'Feature importance' assert ax0.get_xlabel() == "Feature importance"
assert ax0.get_ylabel() == 'Features' assert ax0.get_ylabel() == "Features"
assert len(ax0.patches) <= 30 assert len(ax0.patches) <= 30
gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1) gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
gbm1.fit(X_train, y_train) gbm1.fit(X_train, y_train)
ax1 = lgb.plot_importance(gbm1, color='r', title='t', xlabel='x', ylabel='y') ax1 = lgb.plot_importance(gbm1, color="r", title="t", xlabel="x", ylabel="y")
assert isinstance(ax1, matplotlib.axes.Axes) assert isinstance(ax1, matplotlib.axes.Axes)
assert ax1.get_title() == 't' assert ax1.get_title() == "t"
assert ax1.get_xlabel() == 'x' assert ax1.get_xlabel() == "x"
assert ax1.get_ylabel() == 'y' assert ax1.get_ylabel() == "y"
assert len(ax1.patches) <= 30 assert len(ax1.patches) <= 30
for patch in ax1.patches: for patch in ax1.patches:
assert patch.get_facecolor() == (1., 0, 0, 1.) # red assert patch.get_facecolor() == (1.0, 0, 0, 1.0) # red
ax2 = lgb.plot_importance(gbm0, color=['r', 'y', 'g', 'b'], title=None, xlabel=None, ylabel=None) ax2 = lgb.plot_importance(gbm0, color=["r", "y", "g", "b"], title=None, xlabel=None, ylabel=None)
assert isinstance(ax2, matplotlib.axes.Axes) assert isinstance(ax2, matplotlib.axes.Axes)
assert ax2.get_title() == '' assert ax2.get_title() == ""
assert ax2.get_xlabel() == '' assert ax2.get_xlabel() == ""
assert ax2.get_ylabel() == '' assert ax2.get_ylabel() == ""
assert len(ax2.patches) <= 30 assert len(ax2.patches) <= 30
assert ax2.patches[0].get_facecolor() == (1., 0, 0, 1.) # r assert ax2.patches[0].get_facecolor() == (1.0, 0, 0, 1.0) # r
assert ax2.patches[1].get_facecolor() == (.75, .75, 0, 1.) # y assert ax2.patches[1].get_facecolor() == (0.75, 0.75, 0, 1.0) # y
assert ax2.patches[2].get_facecolor() == (0, .5, 0, 1.) # g assert ax2.patches[2].get_facecolor() == (0, 0.5, 0, 1.0) # g
assert ax2.patches[3].get_facecolor() == (0, 0, 1., 1.) # b assert ax2.patches[3].get_facecolor() == (0, 0, 1.0, 1.0) # b
ax3 = lgb.plot_importance(gbm0, title='t @importance_type@', xlabel='x @importance_type@', ylabel='y @importance_type@') ax3 = lgb.plot_importance(
gbm0, title="t @importance_type@", xlabel="x @importance_type@", ylabel="y @importance_type@"
)
assert isinstance(ax3, matplotlib.axes.Axes) assert isinstance(ax3, matplotlib.axes.Axes)
assert ax3.get_title() == 't @importance_type@' assert ax3.get_title() == "t @importance_type@"
assert ax3.get_xlabel() == 'x split' assert ax3.get_xlabel() == "x split"
assert ax3.get_ylabel() == 'y @importance_type@' assert ax3.get_ylabel() == "y @importance_type@"
assert len(ax3.patches) <= 30 assert len(ax3.patches) <= 30
gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1, importance_type="gain") gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1, importance_type="gain")
...@@ -108,51 +108,59 @@ def test_plot_importance(params, breast_cancer_split, train_data): ...@@ -108,51 +108,59 @@ def test_plot_importance(params, breast_cancer_split, train_data):
assert first_bar1 != first_bar3 assert first_bar1 != first_bar3
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason='matplotlib is not installed') @pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason="matplotlib is not installed")
def test_plot_split_value_histogram(params, breast_cancer_split, train_data): def test_plot_split_value_histogram(params, breast_cancer_split, train_data):
X_train, _, y_train, _ = breast_cancer_split X_train, _, y_train, _ = breast_cancer_split
gbm0 = lgb.train(params, train_data, num_boost_round=10) gbm0 = lgb.train(params, train_data, num_boost_round=10)
ax0 = lgb.plot_split_value_histogram(gbm0, 27) ax0 = lgb.plot_split_value_histogram(gbm0, 27)
assert isinstance(ax0, matplotlib.axes.Axes) assert isinstance(ax0, matplotlib.axes.Axes)
assert ax0.get_title() == 'Split value histogram for feature with index 27' assert ax0.get_title() == "Split value histogram for feature with index 27"
assert ax0.get_xlabel() == 'Feature split value' assert ax0.get_xlabel() == "Feature split value"
assert ax0.get_ylabel() == 'Count' assert ax0.get_ylabel() == "Count"
assert len(ax0.patches) <= 2 assert len(ax0.patches) <= 2
gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1) gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
gbm1.fit(X_train, y_train) gbm1.fit(X_train, y_train)
ax1 = lgb.plot_split_value_histogram(gbm1, gbm1.booster_.feature_name()[27], figsize=(10, 5), ax1 = lgb.plot_split_value_histogram(
title='Histogram for feature @index/name@ @feature@', gbm1,
xlabel='x', ylabel='y', color='r') gbm1.booster_.feature_name()[27],
figsize=(10, 5),
title="Histogram for feature @index/name@ @feature@",
xlabel="x",
ylabel="y",
color="r",
)
assert isinstance(ax1, matplotlib.axes.Axes) assert isinstance(ax1, matplotlib.axes.Axes)
title = f'Histogram for feature name {gbm1.booster_.feature_name()[27]}' title = f"Histogram for feature name {gbm1.booster_.feature_name()[27]}"
assert ax1.get_title() == title assert ax1.get_title() == title
assert ax1.get_xlabel() == 'x' assert ax1.get_xlabel() == "x"
assert ax1.get_ylabel() == 'y' assert ax1.get_ylabel() == "y"
assert len(ax1.patches) <= 2 assert len(ax1.patches) <= 2
for patch in ax1.patches: for patch in ax1.patches:
assert patch.get_facecolor() == (1., 0, 0, 1.) # red assert patch.get_facecolor() == (1.0, 0, 0, 1.0) # red
ax2 = lgb.plot_split_value_histogram(gbm0, 27, bins=10, color=['r', 'y', 'g', 'b'], ax2 = lgb.plot_split_value_histogram(
title=None, xlabel=None, ylabel=None) gbm0, 27, bins=10, color=["r", "y", "g", "b"], title=None, xlabel=None, ylabel=None
)
assert isinstance(ax2, matplotlib.axes.Axes) assert isinstance(ax2, matplotlib.axes.Axes)
assert ax2.get_title() == '' assert ax2.get_title() == ""
assert ax2.get_xlabel() == '' assert ax2.get_xlabel() == ""
assert ax2.get_ylabel() == '' assert ax2.get_ylabel() == ""
assert len(ax2.patches) == 10 assert len(ax2.patches) == 10
assert ax2.patches[0].get_facecolor() == (1., 0, 0, 1.) # r assert ax2.patches[0].get_facecolor() == (1.0, 0, 0, 1.0) # r
assert ax2.patches[1].get_facecolor() == (.75, .75, 0, 1.) # y assert ax2.patches[1].get_facecolor() == (0.75, 0.75, 0, 1.0) # y
assert ax2.patches[2].get_facecolor() == (0, .5, 0, 1.) # g assert ax2.patches[2].get_facecolor() == (0, 0.5, 0, 1.0) # g
assert ax2.patches[3].get_facecolor() == (0, 0, 1., 1.) # b assert ax2.patches[3].get_facecolor() == (0, 0, 1.0, 1.0) # b
with pytest.raises(ValueError): with pytest.raises(ValueError):
lgb.plot_split_value_histogram(gbm0, 0) # was not used in splitting lgb.plot_split_value_histogram(gbm0, 0) # was not used in splitting
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED or not GRAPHVIZ_INSTALLED, @pytest.mark.skipif(
reason='matplotlib or graphviz is not installed') not MATPLOTLIB_INSTALLED or not GRAPHVIZ_INSTALLED, reason="matplotlib or graphviz is not installed"
)
def test_plot_tree(breast_cancer_split): def test_plot_tree(breast_cancer_split):
X_train, _, y_train, _ = breast_cancer_split X_train, _, y_train, _ = breast_cancer_split
gbm = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1) gbm = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
...@@ -161,14 +169,14 @@ def test_plot_tree(breast_cancer_split): ...@@ -161,14 +169,14 @@ def test_plot_tree(breast_cancer_split):
with pytest.raises(IndexError): with pytest.raises(IndexError):
lgb.plot_tree(gbm, tree_index=83) lgb.plot_tree(gbm, tree_index=83)
ax = lgb.plot_tree(gbm, tree_index=3, figsize=(15, 8), show_info=['split_gain']) ax = lgb.plot_tree(gbm, tree_index=3, figsize=(15, 8), show_info=["split_gain"])
assert isinstance(ax, matplotlib.axes.Axes) assert isinstance(ax, matplotlib.axes.Axes)
w, h = ax.axes.get_figure().get_size_inches() w, h = ax.axes.get_figure().get_size_inches()
assert int(w) == 15 assert int(w) == 15
assert int(h) == 8 assert int(h) == 8
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed') @pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_create_tree_digraph(breast_cancer_split): def test_create_tree_digraph(breast_cancer_split):
X_train, _, y_train, _ = breast_cancer_split X_train, _, y_train, _ = breast_cancer_split
...@@ -179,28 +187,32 @@ def test_create_tree_digraph(breast_cancer_split): ...@@ -179,28 +187,32 @@ def test_create_tree_digraph(breast_cancer_split):
with pytest.raises(IndexError): with pytest.raises(IndexError):
lgb.create_tree_digraph(gbm, tree_index=83) lgb.create_tree_digraph(gbm, tree_index=83)
graph = lgb.create_tree_digraph(gbm, tree_index=3, graph = lgb.create_tree_digraph(
show_info=['split_gain', 'internal_value', 'internal_weight'], gbm,
name='Tree4', node_attr={'color': 'red'}) tree_index=3,
show_info=["split_gain", "internal_value", "internal_weight"],
name="Tree4",
node_attr={"color": "red"},
)
graph.render(view=False) graph.render(view=False)
assert isinstance(graph, graphviz.Digraph) assert isinstance(graph, graphviz.Digraph)
assert graph.name == 'Tree4' assert graph.name == "Tree4"
assert len(graph.node_attr) == 1 assert len(graph.node_attr) == 1
assert graph.node_attr['color'] == 'red' assert graph.node_attr["color"] == "red"
assert len(graph.graph_attr) == 0 assert len(graph.graph_attr) == 0
assert len(graph.edge_attr) == 0 assert len(graph.edge_attr) == 0
graph_body = ''.join(graph.body) graph_body = "".join(graph.body)
assert 'leaf' in graph_body assert "leaf" in graph_body
assert 'gain' in graph_body assert "gain" in graph_body
assert 'value' in graph_body assert "value" in graph_body
assert 'weight' in graph_body assert "weight" in graph_body
assert '#ffdddd' in graph_body assert "#ffdddd" in graph_body
assert '#ddffdd' in graph_body assert "#ddffdd" in graph_body
assert 'data' not in graph_body assert "data" not in graph_body
assert 'count' not in graph_body assert "count" not in graph_body
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed') @pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_tree_with_categories_below_max_category_values(): def test_tree_with_categories_below_max_category_values():
X_train, y_train = _categorical_data(2, 10) X_train, y_train = _categorical_data(2, 10)
params = { params = {
...@@ -211,7 +223,7 @@ def test_tree_with_categories_below_max_category_values(): ...@@ -211,7 +223,7 @@ def test_tree_with_categories_below_max_category_values():
"deterministic": True, "deterministic": True,
"num_threads": 1, "num_threads": 1,
"seed": 708, "seed": 708,
"verbose": -1 "verbose": -1,
} }
gbm = lgb.LGBMClassifier(**params) gbm = lgb.LGBMClassifier(**params)
gbm.fit(X_train, y_train) gbm.fit(X_train, y_train)
...@@ -219,28 +231,32 @@ def test_tree_with_categories_below_max_category_values(): ...@@ -219,28 +231,32 @@ def test_tree_with_categories_below_max_category_values():
with pytest.raises(IndexError): with pytest.raises(IndexError):
lgb.create_tree_digraph(gbm, tree_index=83) lgb.create_tree_digraph(gbm, tree_index=83)
graph = lgb.create_tree_digraph(gbm, tree_index=3, graph = lgb.create_tree_digraph(
show_info=['split_gain', 'internal_value', 'internal_weight'], gbm,
name='Tree4', node_attr={'color': 'red'}, tree_index=3,
max_category_values=10) show_info=["split_gain", "internal_value", "internal_weight"],
name="Tree4",
node_attr={"color": "red"},
max_category_values=10,
)
graph.render(view=False) graph.render(view=False)
assert isinstance(graph, graphviz.Digraph) assert isinstance(graph, graphviz.Digraph)
assert graph.name == 'Tree4' assert graph.name == "Tree4"
assert len(graph.node_attr) == 1 assert len(graph.node_attr) == 1
assert graph.node_attr['color'] == 'red' assert graph.node_attr["color"] == "red"
assert len(graph.graph_attr) == 0 assert len(graph.graph_attr) == 0
assert len(graph.edge_attr) == 0 assert len(graph.edge_attr) == 0
graph_body = ''.join(graph.body) graph_body = "".join(graph.body)
assert 'leaf' in graph_body assert "leaf" in graph_body
assert 'gain' in graph_body assert "gain" in graph_body
assert 'value' in graph_body assert "value" in graph_body
assert 'weight' in graph_body assert "weight" in graph_body
assert 'data' not in graph_body assert "data" not in graph_body
assert 'count' not in graph_body assert "count" not in graph_body
assert '||...||' not in graph_body assert "||...||" not in graph_body
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed') @pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_tree_with_categories_above_max_category_values(): def test_tree_with_categories_above_max_category_values():
X_train, y_train = _categorical_data(20, 30) X_train, y_train = _categorical_data(20, 30)
params = { params = {
...@@ -251,7 +267,7 @@ def test_tree_with_categories_above_max_category_values(): ...@@ -251,7 +267,7 @@ def test_tree_with_categories_above_max_category_values():
"deterministic": True, "deterministic": True,
"num_threads": 1, "num_threads": 1,
"seed": 708, "seed": 708,
"verbose": -1 "verbose": -1,
} }
gbm = lgb.LGBMClassifier(**params) gbm = lgb.LGBMClassifier(**params)
gbm.fit(X_train, y_train) gbm.fit(X_train, y_train)
...@@ -259,32 +275,36 @@ def test_tree_with_categories_above_max_category_values(): ...@@ -259,32 +275,36 @@ def test_tree_with_categories_above_max_category_values():
with pytest.raises(IndexError): with pytest.raises(IndexError):
lgb.create_tree_digraph(gbm, tree_index=83) lgb.create_tree_digraph(gbm, tree_index=83)
graph = lgb.create_tree_digraph(gbm, tree_index=9, graph = lgb.create_tree_digraph(
show_info=['split_gain', 'internal_value', 'internal_weight'], gbm,
name='Tree4', node_attr={'color': 'red'}, tree_index=9,
max_category_values=4) show_info=["split_gain", "internal_value", "internal_weight"],
name="Tree4",
node_attr={"color": "red"},
max_category_values=4,
)
graph.render(view=False) graph.render(view=False)
assert isinstance(graph, graphviz.Digraph) assert isinstance(graph, graphviz.Digraph)
assert graph.name == 'Tree4' assert graph.name == "Tree4"
assert len(graph.node_attr) == 1 assert len(graph.node_attr) == 1
assert graph.node_attr['color'] == 'red' assert graph.node_attr["color"] == "red"
assert len(graph.graph_attr) == 0 assert len(graph.graph_attr) == 0
assert len(graph.edge_attr) == 0 assert len(graph.edge_attr) == 0
graph_body = ''.join(graph.body) graph_body = "".join(graph.body)
assert 'leaf' in graph_body assert "leaf" in graph_body
assert 'gain' in graph_body assert "gain" in graph_body
assert 'value' in graph_body assert "value" in graph_body
assert 'weight' in graph_body assert "weight" in graph_body
assert 'data' not in graph_body assert "data" not in graph_body
assert 'count' not in graph_body assert "count" not in graph_body
assert '||...||' in graph_body assert "||...||" in graph_body
@pytest.mark.parametrize('use_missing', [True, False]) @pytest.mark.parametrize("use_missing", [True, False])
@pytest.mark.parametrize('zero_as_missing', [True, False]) @pytest.mark.parametrize("zero_as_missing", [True, False])
def test_numeric_split_direction(use_missing, zero_as_missing): def test_numeric_split_direction(use_missing, zero_as_missing):
if use_missing and zero_as_missing: if use_missing and zero_as_missing:
pytest.skip('use_missing and zero_as_missing both set to True') pytest.skip("use_missing and zero_as_missing both set to True")
X, y = make_synthetic_regression() X, y = make_synthetic_regression()
rng = np.random.RandomState(0) rng = np.random.RandomState(0)
zero_mask = rng.rand(X.shape[0]) < 0.05 zero_mask = rng.rand(X.shape[0]) < 0.05
...@@ -294,48 +314,48 @@ def test_numeric_split_direction(use_missing, zero_as_missing): ...@@ -294,48 +314,48 @@ def test_numeric_split_direction(use_missing, zero_as_missing):
X[nan_mask, :] = np.nan X[nan_mask, :] = np.nan
ds = lgb.Dataset(X, y) ds = lgb.Dataset(X, y)
params = { params = {
'num_leaves': 127, "num_leaves": 127,
'min_child_samples': 1, "min_child_samples": 1,
'use_missing': use_missing, "use_missing": use_missing,
'zero_as_missing': zero_as_missing, "zero_as_missing": zero_as_missing,
} }
bst = lgb.train(params, ds, num_boost_round=1) bst = lgb.train(params, ds, num_boost_round=1)
case_with_zero = X[zero_mask][[0]] case_with_zero = X[zero_mask][[0]]
expected_leaf_zero = bst.predict(case_with_zero, pred_leaf=True)[0] expected_leaf_zero = bst.predict(case_with_zero, pred_leaf=True)[0]
node = bst.dump_model()['tree_info'][0]['tree_structure'] node = bst.dump_model()["tree_info"][0]["tree_structure"]
while 'decision_type' in node: while "decision_type" in node:
direction = lgb.plotting._determine_direction_for_numeric_split( direction = lgb.plotting._determine_direction_for_numeric_split(
case_with_zero[0][node['split_feature']], node['threshold'], node['missing_type'], node['default_left'] case_with_zero[0][node["split_feature"]], node["threshold"], node["missing_type"], node["default_left"]
) )
node = node['left_child'] if direction == 'left' else node['right_child'] node = node["left_child"] if direction == "left" else node["right_child"]
assert node['leaf_index'] == expected_leaf_zero assert node["leaf_index"] == expected_leaf_zero
if use_missing: if use_missing:
case_with_nan = X[nan_mask][[0]] case_with_nan = X[nan_mask][[0]]
expected_leaf_nan = bst.predict(case_with_nan, pred_leaf=True)[0] expected_leaf_nan = bst.predict(case_with_nan, pred_leaf=True)[0]
node = bst.dump_model()['tree_info'][0]['tree_structure'] node = bst.dump_model()["tree_info"][0]["tree_structure"]
while 'decision_type' in node: while "decision_type" in node:
direction = lgb.plotting._determine_direction_for_numeric_split( direction = lgb.plotting._determine_direction_for_numeric_split(
case_with_nan[0][node['split_feature']], node['threshold'], node['missing_type'], node['default_left'] case_with_nan[0][node["split_feature"]], node["threshold"], node["missing_type"], node["default_left"]
) )
node = node['left_child'] if direction == 'left' else node['right_child'] node = node["left_child"] if direction == "left" else node["right_child"]
assert node['leaf_index'] == expected_leaf_nan assert node["leaf_index"] == expected_leaf_nan
assert expected_leaf_zero != expected_leaf_nan assert expected_leaf_zero != expected_leaf_nan
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed') @pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_example_case_in_tree_digraph(): def test_example_case_in_tree_digraph():
rng = np.random.RandomState(0) rng = np.random.RandomState(0)
x1 = rng.rand(100) x1 = rng.rand(100)
cat = rng.randint(1, 3, size=x1.size) cat = rng.randint(1, 3, size=x1.size)
X = np.vstack([x1, cat]).T X = np.vstack([x1, cat]).T
y = x1 + 2 * cat y = x1 + 2 * cat
feature_name = ['x1', 'cat'] feature_name = ["x1", "cat"]
ds = lgb.Dataset(X, y, feature_name=feature_name, categorical_feature=['cat']) ds = lgb.Dataset(X, y, feature_name=feature_name, categorical_feature=["cat"])
num_round = 3 num_round = 3
bst = lgb.train({'num_leaves': 7}, ds, num_boost_round=num_round) bst = lgb.train({"num_leaves": 7}, ds, num_boost_round=num_round)
mod = bst.dump_model() mod = bst.dump_model()
example_case = X[[0]] example_case = X[[0]]
makes_categorical_splits = False makes_categorical_splits = False
...@@ -343,42 +363,46 @@ def test_example_case_in_tree_digraph(): ...@@ -343,42 +363,46 @@ def test_example_case_in_tree_digraph():
for i in range(num_round): for i in range(num_round):
graph = lgb.create_tree_digraph(bst, example_case=example_case, tree_index=i) graph = lgb.create_tree_digraph(bst, example_case=example_case, tree_index=i)
gbody = graph.body gbody = graph.body
node = mod['tree_info'][i]['tree_structure'] node = mod["tree_info"][i]["tree_structure"]
while 'decision_type' in node: # iterate through the splits while "decision_type" in node: # iterate through the splits
split_index = node['split_index'] split_index = node["split_index"]
node_in_graph = [n for n in gbody if f'split{split_index}' in n and '->' not in n] node_in_graph = [n for n in gbody if f"split{split_index}" in n and "->" not in n]
assert len(node_in_graph) == 1 assert len(node_in_graph) == 1
seen_indices.add(gbody.index(node_in_graph[0])) seen_indices.add(gbody.index(node_in_graph[0]))
edge_to_node = [e for e in gbody if f'-> split{split_index}' in e] edge_to_node = [e for e in gbody if f"-> split{split_index}" in e]
if node['decision_type'] == '<=': if node["decision_type"] == "<=":
direction = lgb.plotting._determine_direction_for_numeric_split( direction = lgb.plotting._determine_direction_for_numeric_split(
example_case[0][node['split_feature']], node['threshold'], node['missing_type'], node['default_left']) example_case[0][node["split_feature"]],
node["threshold"],
node["missing_type"],
node["default_left"],
)
else: else:
makes_categorical_splits = True makes_categorical_splits = True
direction = lgb.plotting._determine_direction_for_categorical_split( direction = lgb.plotting._determine_direction_for_categorical_split(
example_case[0][node['split_feature']], node['threshold'] example_case[0][node["split_feature"]], node["threshold"]
) )
node = node['left_child'] if direction == 'left' else node['right_child'] node = node["left_child"] if direction == "left" else node["right_child"]
assert 'color=blue' in node_in_graph[0] assert "color=blue" in node_in_graph[0]
if edge_to_node: if edge_to_node:
assert len(edge_to_node) == 1 assert len(edge_to_node) == 1
assert 'color=blue' in edge_to_node[0] assert "color=blue" in edge_to_node[0]
seen_indices.add(gbody.index(edge_to_node[0])) seen_indices.add(gbody.index(edge_to_node[0]))
# we're in a leaf now # we're in a leaf now
leaf_index = node['leaf_index'] leaf_index = node["leaf_index"]
leaf_in_graph = [n for n in gbody if f'leaf{leaf_index}' in n and '->' not in n] leaf_in_graph = [n for n in gbody if f"leaf{leaf_index}" in n and "->" not in n]
edge_to_leaf = [e for e in gbody if f'-> leaf{leaf_index}' in e] edge_to_leaf = [e for e in gbody if f"-> leaf{leaf_index}" in e]
assert len(leaf_in_graph) == 1 assert len(leaf_in_graph) == 1
assert 'color=blue' in leaf_in_graph[0] assert "color=blue" in leaf_in_graph[0]
assert len(edge_to_leaf) == 1 assert len(edge_to_leaf) == 1
assert 'color=blue' in edge_to_leaf[0] assert "color=blue" in edge_to_leaf[0]
seen_indices.update([gbody.index(leaf_in_graph[0]), gbody.index(edge_to_leaf[0])]) seen_indices.update([gbody.index(leaf_in_graph[0]), gbody.index(edge_to_leaf[0])])
# check that the rest of the elements have black color # check that the rest of the elements have black color
remaining_elements = [e for i, e in enumerate(graph.body) if i not in seen_indices and 'graph' not in e] remaining_elements = [e for i, e in enumerate(graph.body) if i not in seen_indices and "graph" not in e]
assert all('color=black' in e for e in remaining_elements) assert all("color=black" in e for e in remaining_elements)
# check that we got to the expected leaf # check that we got to the expected leaf
expected_leaf = bst.predict(example_case, start_iteration=i, num_iteration=1, pred_leaf=True)[0] expected_leaf = bst.predict(example_case, start_iteration=i, num_iteration=1, pred_leaf=True)[0]
...@@ -386,83 +410,86 @@ def test_example_case_in_tree_digraph(): ...@@ -386,83 +410,86 @@ def test_example_case_in_tree_digraph():
assert makes_categorical_splits assert makes_categorical_splits
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed') @pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
@pytest.mark.parametrize('input_type', ['array', 'dataframe']) @pytest.mark.parametrize("input_type", ["array", "dataframe"])
def test_empty_example_case_on_tree_digraph_raises_error(input_type): def test_empty_example_case_on_tree_digraph_raises_error(input_type):
X, y = make_synthetic_regression() X, y = make_synthetic_regression()
if input_type == 'dataframe': if input_type == "dataframe":
if not PANDAS_INSTALLED: if not PANDAS_INSTALLED:
pytest.skip(reason='pandas is not installed') pytest.skip(reason="pandas is not installed")
X = pd_DataFrame(X) X = pd_DataFrame(X)
ds = lgb.Dataset(X, y) ds = lgb.Dataset(X, y)
bst = lgb.train({'num_leaves': 3}, ds, num_boost_round=1) bst = lgb.train({"num_leaves": 3}, ds, num_boost_round=1)
example_case = X[:0] example_case = X[:0]
if input_type == 'dataframe': if input_type == "dataframe":
example_case = pd_DataFrame(example_case) example_case = pd_DataFrame(example_case)
with pytest.raises(ValueError, match='example_case must have a single row.'): with pytest.raises(ValueError, match="example_case must have a single row."):
lgb.create_tree_digraph(bst, tree_index=0, example_case=example_case) lgb.create_tree_digraph(bst, tree_index=0, example_case=example_case)
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason='matplotlib is not installed') @pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason="matplotlib is not installed")
def test_plot_metrics(params, breast_cancer_split, train_data): def test_plot_metrics(params, breast_cancer_split, train_data):
X_train, X_test, y_train, y_test = breast_cancer_split X_train, X_test, y_train, y_test = breast_cancer_split
test_data = lgb.Dataset(X_test, y_test, reference=train_data) test_data = lgb.Dataset(X_test, y_test, reference=train_data)
params.update({"metric": {"binary_logloss", "binary_error"}}) params.update({"metric": {"binary_logloss", "binary_error"}})
evals_result0 = {} evals_result0 = {}
lgb.train(params, train_data, lgb.train(
valid_sets=[train_data, test_data], params,
valid_names=['v1', 'v2'], train_data,
num_boost_round=10, valid_sets=[train_data, test_data],
callbacks=[lgb.record_evaluation(evals_result0)]) valid_names=["v1", "v2"],
num_boost_round=10,
callbacks=[lgb.record_evaluation(evals_result0)],
)
with pytest.warns(UserWarning, match="More than one metric available, picking one to plot."): with pytest.warns(UserWarning, match="More than one metric available, picking one to plot."):
ax0 = lgb.plot_metric(evals_result0) ax0 = lgb.plot_metric(evals_result0)
assert isinstance(ax0, matplotlib.axes.Axes) assert isinstance(ax0, matplotlib.axes.Axes)
assert ax0.get_title() == 'Metric during training' assert ax0.get_title() == "Metric during training"
assert ax0.get_xlabel() == 'Iterations' assert ax0.get_xlabel() == "Iterations"
assert ax0.get_ylabel() in {'binary_logloss', 'binary_error'} assert ax0.get_ylabel() in {"binary_logloss", "binary_error"}
legend_items = ax0.get_legend().get_texts() legend_items = ax0.get_legend().get_texts()
assert len(legend_items) == 2 assert len(legend_items) == 2
assert legend_items[0].get_text() == 'v1' assert legend_items[0].get_text() == "v1"
assert legend_items[1].get_text() == 'v2' assert legend_items[1].get_text() == "v2"
ax1 = lgb.plot_metric(evals_result0, metric='binary_error') ax1 = lgb.plot_metric(evals_result0, metric="binary_error")
assert isinstance(ax1, matplotlib.axes.Axes) assert isinstance(ax1, matplotlib.axes.Axes)
assert ax1.get_title() == 'Metric during training' assert ax1.get_title() == "Metric during training"
assert ax1.get_xlabel() == 'Iterations' assert ax1.get_xlabel() == "Iterations"
assert ax1.get_ylabel() == 'binary_error' assert ax1.get_ylabel() == "binary_error"
legend_items = ax1.get_legend().get_texts() legend_items = ax1.get_legend().get_texts()
assert len(legend_items) == 2 assert len(legend_items) == 2
assert legend_items[0].get_text() == 'v1' assert legend_items[0].get_text() == "v1"
assert legend_items[1].get_text() == 'v2' assert legend_items[1].get_text() == "v2"
ax2 = lgb.plot_metric(evals_result0, metric='binary_logloss', dataset_names=['v2']) ax2 = lgb.plot_metric(evals_result0, metric="binary_logloss", dataset_names=["v2"])
assert isinstance(ax2, matplotlib.axes.Axes) assert isinstance(ax2, matplotlib.axes.Axes)
assert ax2.get_title() == 'Metric during training' assert ax2.get_title() == "Metric during training"
assert ax2.get_xlabel() == 'Iterations' assert ax2.get_xlabel() == "Iterations"
assert ax2.get_ylabel() == 'binary_logloss' assert ax2.get_ylabel() == "binary_logloss"
legend_items = ax2.get_legend().get_texts() legend_items = ax2.get_legend().get_texts()
assert len(legend_items) == 1 assert len(legend_items) == 1
assert legend_items[0].get_text() == 'v2' assert legend_items[0].get_text() == "v2"
ax3 = lgb.plot_metric( ax3 = lgb.plot_metric(
evals_result0, evals_result0,
metric='binary_logloss', metric="binary_logloss",
dataset_names=['v1'], dataset_names=["v1"],
title='Metric @metric@', title="Metric @metric@",
xlabel='Iterations @metric@', xlabel="Iterations @metric@",
ylabel='Value of "@metric@"', ylabel='Value of "@metric@"',
figsize=(5, 5), figsize=(5, 5),
dpi=600, dpi=600,
grid=False grid=False,
) )
assert isinstance(ax3, matplotlib.axes.Axes) assert isinstance(ax3, matplotlib.axes.Axes)
assert ax3.get_title() == 'Metric @metric@' assert ax3.get_title() == "Metric @metric@"
assert ax3.get_xlabel() == 'Iterations @metric@' assert ax3.get_xlabel() == "Iterations @metric@"
assert ax3.get_ylabel() == 'Value of "binary_logloss"' assert ax3.get_ylabel() == 'Value of "binary_logloss"'
legend_items = ax3.get_legend().get_texts() legend_items = ax3.get_legend().get_texts()
assert len(legend_items) == 1 assert len(legend_items) == 1
assert legend_items[0].get_text() == 'v1' assert legend_items[0].get_text() == "v1"
assert ax3.get_figure().get_figheight() == 5 assert ax3.get_figure().get_figheight() == 5
assert ax3.get_figure().get_figwidth() == 5 assert ax3.get_figure().get_figwidth() == 5
assert ax3.get_figure().get_dpi() == 600 assert ax3.get_figure().get_dpi() == 600
...@@ -472,9 +499,7 @@ def test_plot_metrics(params, breast_cancer_split, train_data): ...@@ -472,9 +499,7 @@ def test_plot_metrics(params, breast_cancer_split, train_data):
assert not grid_line.get_visible() assert not grid_line.get_visible()
evals_result1 = {} evals_result1 = {}
lgb.train(params, train_data, lgb.train(params, train_data, num_boost_round=10, callbacks=[lgb.record_evaluation(evals_result1)])
num_boost_round=10,
callbacks=[lgb.record_evaluation(evals_result1)])
with pytest.raises(ValueError, match="eval results cannot be empty."): with pytest.raises(ValueError, match="eval results cannot be empty."):
lgb.plot_metric(evals_result1) lgb.plot_metric(evals_result1)
...@@ -482,9 +507,9 @@ def test_plot_metrics(params, breast_cancer_split, train_data): ...@@ -482,9 +507,9 @@ def test_plot_metrics(params, breast_cancer_split, train_data):
gbm2.fit(X_train, y_train, eval_set=[(X_test, y_test)]) gbm2.fit(X_train, y_train, eval_set=[(X_test, y_test)])
ax4 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None) ax4 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
assert isinstance(ax4, matplotlib.axes.Axes) assert isinstance(ax4, matplotlib.axes.Axes)
assert ax4.get_title() == '' assert ax4.get_title() == ""
assert ax4.get_xlabel() == '' assert ax4.get_xlabel() == ""
assert ax4.get_ylabel() == '' assert ax4.get_ylabel() == ""
legend_items = ax4.get_legend().get_texts() legend_items = ax4.get_legend().get_texts()
assert len(legend_items) == 1 assert len(legend_items) == 1
assert legend_items[0].get_text() == 'valid_0' assert legend_items[0].get_text() == "valid_0"
...@@ -23,32 +23,40 @@ from sklearn.utils.validation import check_is_fitted ...@@ -23,32 +23,40 @@ from sklearn.utils.validation import check_is_fitted
import lightgbm as lgb import lightgbm as lgb
from lightgbm.compat import DATATABLE_INSTALLED, PANDAS_INSTALLED, dt_DataTable, pd_DataFrame, pd_Series from lightgbm.compat import DATATABLE_INSTALLED, PANDAS_INSTALLED, dt_DataTable, pd_DataFrame, pd_Series
from .utils import (load_breast_cancer, load_digits, load_iris, load_linnerud, make_ranking, make_synthetic_regression, from .utils import (
sklearn_multiclass_custom_objective, softmax) load_breast_cancer,
load_digits,
load_iris,
load_linnerud,
make_ranking,
make_synthetic_regression,
sklearn_multiclass_custom_objective,
softmax,
)
decreasing_generator = itertools.count(0, -1) decreasing_generator = itertools.count(0, -1)
task_to_model_factory = { task_to_model_factory = {
'ranking': lgb.LGBMRanker, "ranking": lgb.LGBMRanker,
'binary-classification': lgb.LGBMClassifier, "binary-classification": lgb.LGBMClassifier,
'multiclass-classification': lgb.LGBMClassifier, "multiclass-classification": lgb.LGBMClassifier,
'regression': lgb.LGBMRegressor, "regression": lgb.LGBMRegressor,
} }
def _create_data(task, n_samples=100, n_features=4): def _create_data(task, n_samples=100, n_features=4):
if task == 'ranking': if task == "ranking":
X, y, g = make_ranking(n_features=4, n_samples=n_samples) X, y, g = make_ranking(n_features=4, n_samples=n_samples)
g = np.bincount(g) g = np.bincount(g)
elif task.endswith('classification'): elif task.endswith("classification"):
if task == 'binary-classification': if task == "binary-classification":
centers = 2 centers = 2
elif task == 'multiclass-classification': elif task == "multiclass-classification":
centers = 3 centers = 3
else: else:
ValueError(f"Unknown classification task '{task}'") ValueError(f"Unknown classification task '{task}'")
X, y = make_blobs(n_samples=n_samples, n_features=n_features, centers=centers, random_state=42) X, y = make_blobs(n_samples=n_samples, n_features=n_features, centers=centers, random_state=42)
g = None g = None
elif task == 'regression': elif task == "regression":
X, y = make_synthetic_regression(n_samples=n_samples, n_features=n_features) X, y = make_synthetic_regression(n_samples=n_samples, n_features=n_features)
g = None g = None
return X, y, g return X, y, g
...@@ -70,7 +78,7 @@ def custom_asymmetric_obj(y_true, y_pred): ...@@ -70,7 +78,7 @@ def custom_asymmetric_obj(y_true, y_pred):
def objective_ls(y_true, y_pred): def objective_ls(y_true, y_pred):
grad = (y_pred - y_true) grad = y_pred - y_true
hess = np.ones(len(y_true)) hess = np.ones(len(y_true))
return grad, hess return grad, hess
...@@ -87,15 +95,15 @@ def custom_dummy_obj(y_true, y_pred): ...@@ -87,15 +95,15 @@ def custom_dummy_obj(y_true, y_pred):
def constant_metric(y_true, y_pred): def constant_metric(y_true, y_pred):
return 'error', 0, False return "error", 0, False
def decreasing_metric(y_true, y_pred): def decreasing_metric(y_true, y_pred):
return ('decreasing_metric', next(decreasing_generator), False) return ("decreasing_metric", next(decreasing_generator), False)
def mse(y_true, y_pred): def mse(y_true, y_pred):
return 'custom MSE', mean_squared_error(y_true, y_pred), False return "custom MSE", mean_squared_error(y_true, y_pred), False
def binary_error(y_true, y_pred): def binary_error(y_true, y_pred):
...@@ -117,7 +125,7 @@ def test_binary(): ...@@ -117,7 +125,7 @@ def test_binary():
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[lgb.early_stopping(5)]) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[lgb.early_stopping(5)])
ret = log_loss(y_test, gbm.predict_proba(X_test)) ret = log_loss(y_test, gbm.predict_proba(X_test))
assert ret < 0.12 assert ret < 0.12
assert gbm.evals_result_['valid_0']['binary_logloss'][gbm.best_iteration_ - 1] == pytest.approx(ret) assert gbm.evals_result_["valid_0"]["binary_logloss"][gbm.best_iteration_ - 1] == pytest.approx(ret)
def test_regression(): def test_regression():
...@@ -127,10 +135,12 @@ def test_regression(): ...@@ -127,10 +135,12 @@ def test_regression():
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[lgb.early_stopping(5)]) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[lgb.early_stopping(5)])
ret = mean_squared_error(y_test, gbm.predict(X_test)) ret = mean_squared_error(y_test, gbm.predict(X_test))
assert ret < 174 assert ret < 174
assert gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1] == pytest.approx(ret) assert gbm.evals_result_["valid_0"]["l2"][gbm.best_iteration_ - 1] == pytest.approx(ret)
@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version') @pytest.mark.skipif(
getenv("TASK", "") == "cuda", reason="Skip due to differences in implementation details of CUDA version"
)
def test_multiclass(): def test_multiclass():
X, y = load_digits(n_class=10, return_X_y=True) X, y = load_digits(n_class=10, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
...@@ -140,16 +150,18 @@ def test_multiclass(): ...@@ -140,16 +150,18 @@ def test_multiclass():
assert ret < 0.05 assert ret < 0.05
ret = multi_logloss(y_test, gbm.predict_proba(X_test)) ret = multi_logloss(y_test, gbm.predict_proba(X_test))
assert ret < 0.16 assert ret < 0.16
assert gbm.evals_result_['valid_0']['multi_logloss'][gbm.best_iteration_ - 1] == pytest.approx(ret) assert gbm.evals_result_["valid_0"]["multi_logloss"][gbm.best_iteration_ - 1] == pytest.approx(ret)
@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version') @pytest.mark.skipif(
getenv("TASK", "") == "cuda", reason="Skip due to differences in implementation details of CUDA version"
)
def test_lambdarank(): def test_lambdarank():
rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank' rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank"
X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train')) X_train, y_train = load_svmlight_file(str(rank_example_dir / "rank.train"))
X_test, y_test = load_svmlight_file(str(rank_example_dir / 'rank.test')) X_test, y_test = load_svmlight_file(str(rank_example_dir / "rank.test"))
q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query')) q_train = np.loadtxt(str(rank_example_dir / "rank.train.query"))
q_test = np.loadtxt(str(rank_example_dir / 'rank.test.query')) q_test = np.loadtxt(str(rank_example_dir / "rank.test.query"))
gbm = lgb.LGBMRanker(n_estimators=50) gbm = lgb.LGBMRanker(n_estimators=50)
gbm.fit( gbm.fit(
X_train, X_train,
...@@ -158,23 +170,20 @@ def test_lambdarank(): ...@@ -158,23 +170,20 @@ def test_lambdarank():
eval_set=[(X_test, y_test)], eval_set=[(X_test, y_test)],
eval_group=[q_test], eval_group=[q_test],
eval_at=[1, 3], eval_at=[1, 3],
callbacks=[ callbacks=[lgb.early_stopping(10), lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))],
lgb.early_stopping(10),
lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))
]
) )
assert gbm.best_iteration_ <= 24 assert gbm.best_iteration_ <= 24
assert gbm.best_score_['valid_0']['ndcg@1'] > 0.5674 assert gbm.best_score_["valid_0"]["ndcg@1"] > 0.5674
assert gbm.best_score_['valid_0']['ndcg@3'] > 0.578 assert gbm.best_score_["valid_0"]["ndcg@3"] > 0.578
def test_xendcg(): def test_xendcg():
xendcg_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'xendcg' xendcg_example_dir = Path(__file__).absolute().parents[2] / "examples" / "xendcg"
X_train, y_train = load_svmlight_file(str(xendcg_example_dir / 'rank.train')) X_train, y_train = load_svmlight_file(str(xendcg_example_dir / "rank.train"))
X_test, y_test = load_svmlight_file(str(xendcg_example_dir / 'rank.test')) X_test, y_test = load_svmlight_file(str(xendcg_example_dir / "rank.test"))
q_train = np.loadtxt(str(xendcg_example_dir / 'rank.train.query')) q_train = np.loadtxt(str(xendcg_example_dir / "rank.train.query"))
q_test = np.loadtxt(str(xendcg_example_dir / 'rank.test.query')) q_test = np.loadtxt(str(xendcg_example_dir / "rank.test.query"))
gbm = lgb.LGBMRanker(n_estimators=50, objective='rank_xendcg', random_state=5, n_jobs=1) gbm = lgb.LGBMRanker(n_estimators=50, objective="rank_xendcg", random_state=5, n_jobs=1)
gbm.fit( gbm.fit(
X_train, X_train,
y_train, y_train,
...@@ -182,28 +191,25 @@ def test_xendcg(): ...@@ -182,28 +191,25 @@ def test_xendcg():
eval_set=[(X_test, y_test)], eval_set=[(X_test, y_test)],
eval_group=[q_test], eval_group=[q_test],
eval_at=[1, 3], eval_at=[1, 3],
eval_metric='ndcg', eval_metric="ndcg",
callbacks=[ callbacks=[lgb.early_stopping(10), lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))],
lgb.early_stopping(10),
lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))
]
) )
assert gbm.best_iteration_ <= 24 assert gbm.best_iteration_ <= 24
assert gbm.best_score_['valid_0']['ndcg@1'] > 0.6211 assert gbm.best_score_["valid_0"]["ndcg@1"] > 0.6211
assert gbm.best_score_['valid_0']['ndcg@3'] > 0.6253 assert gbm.best_score_["valid_0"]["ndcg@3"] > 0.6253
def test_eval_at_aliases(): def test_eval_at_aliases():
rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank' rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank"
X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train')) X_train, y_train = load_svmlight_file(str(rank_example_dir / "rank.train"))
X_test, y_test = load_svmlight_file(str(rank_example_dir / 'rank.test')) X_test, y_test = load_svmlight_file(str(rank_example_dir / "rank.test"))
q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query')) q_train = np.loadtxt(str(rank_example_dir / "rank.train.query"))
q_test = np.loadtxt(str(rank_example_dir / 'rank.test.query')) q_test = np.loadtxt(str(rank_example_dir / "rank.test.query"))
for alias in lgb.basic._ConfigAliases.get('eval_at'): for alias in lgb.basic._ConfigAliases.get("eval_at"):
gbm = lgb.LGBMRanker(n_estimators=5, **{alias: [1, 2, 3, 9]}) gbm = lgb.LGBMRanker(n_estimators=5, **{alias: [1, 2, 3, 9]})
with pytest.warns(UserWarning, match=f"Found '{alias}' in params. Will use it instead of 'eval_at' argument"): with pytest.warns(UserWarning, match=f"Found '{alias}' in params. Will use it instead of 'eval_at' argument"):
gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)], eval_group=[q_test]) gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)], eval_group=[q_test])
assert list(gbm.evals_result_['valid_0'].keys()) == ['ndcg@1', 'ndcg@2', 'ndcg@3', 'ndcg@9'] assert list(gbm.evals_result_["valid_0"].keys()) == ["ndcg@1", "ndcg@2", "ndcg@3", "ndcg@9"]
@pytest.mark.parametrize("custom_objective", [True, False]) @pytest.mark.parametrize("custom_objective", [True, False])
...@@ -212,20 +218,22 @@ def test_objective_aliases(custom_objective): ...@@ -212,20 +218,22 @@ def test_objective_aliases(custom_objective):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
if custom_objective: if custom_objective:
obj = custom_dummy_obj obj = custom_dummy_obj
metric_name = 'l2' # default one metric_name = "l2" # default one
else: else:
obj = 'mape' obj = "mape"
metric_name = 'mape' metric_name = "mape"
evals = [] evals = []
for alias in lgb.basic._ConfigAliases.get('objective'): for alias in lgb.basic._ConfigAliases.get("objective"):
gbm = lgb.LGBMRegressor(n_estimators=5, **{alias: obj}) gbm = lgb.LGBMRegressor(n_estimators=5, **{alias: obj})
if alias != 'objective': if alias != "objective":
with pytest.warns(UserWarning, match=f"Found '{alias}' in params. Will use it instead of 'objective' argument"): with pytest.warns(
UserWarning, match=f"Found '{alias}' in params. Will use it instead of 'objective' argument"
):
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)]) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)])
else: else:
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)]) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)])
assert list(gbm.evals_result_['valid_0'].keys()) == [metric_name] assert list(gbm.evals_result_["valid_0"].keys()) == [metric_name]
evals.append(gbm.evals_result_['valid_0'][metric_name]) evals.append(gbm.evals_result_["valid_0"][metric_name])
evals_t = np.array(evals).T evals_t = np.array(evals).T
for i in range(evals_t.shape[0]): for i in range(evals_t.shape[0]):
np.testing.assert_allclose(evals_t[i], evals_t[i][0]) np.testing.assert_allclose(evals_t[i], evals_t[i][0])
...@@ -241,7 +249,7 @@ def test_regression_with_custom_objective(): ...@@ -241,7 +249,7 @@ def test_regression_with_custom_objective():
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[lgb.early_stopping(5)]) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[lgb.early_stopping(5)])
ret = mean_squared_error(y_test, gbm.predict(X_test)) ret = mean_squared_error(y_test, gbm.predict(X_test))
assert ret < 174 assert ret < 174
assert gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1] == pytest.approx(ret) assert gbm.evals_result_["valid_0"]["l2"][gbm.best_iteration_ - 1] == pytest.approx(ret)
def test_binary_classification_with_custom_objective(): def test_binary_classification_with_custom_objective():
...@@ -260,7 +268,7 @@ def test_binary_classification_with_custom_objective(): ...@@ -260,7 +268,7 @@ def test_binary_classification_with_custom_objective():
def test_dart(): def test_dart():
X, y = make_synthetic_regression() X, y = make_synthetic_regression()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMRegressor(boosting_type='dart', n_estimators=50) gbm = lgb.LGBMRegressor(boosting_type="dart", n_estimators=50)
gbm.fit(X_train, y_train) gbm.fit(X_train, y_train)
score = gbm.score(X_test, y_test) score = gbm.score(X_test, y_test)
assert 0.8 <= score <= 1.0 assert 0.8 <= score <= 1.0
...@@ -269,22 +277,21 @@ def test_dart(): ...@@ -269,22 +277,21 @@ def test_dart():
def test_stacking_classifier(): def test_stacking_classifier():
X, y = load_iris(return_X_y=True) X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
classifiers = [('gbm1', lgb.LGBMClassifier(n_estimators=3)), classifiers = [("gbm1", lgb.LGBMClassifier(n_estimators=3)), ("gbm2", lgb.LGBMClassifier(n_estimators=3))]
('gbm2', lgb.LGBMClassifier(n_estimators=3))] clf = StackingClassifier(
clf = StackingClassifier(estimators=classifiers, estimators=classifiers, final_estimator=lgb.LGBMClassifier(n_estimators=3), passthrough=True
final_estimator=lgb.LGBMClassifier(n_estimators=3), )
passthrough=True)
clf.fit(X_train, y_train) clf.fit(X_train, y_train)
score = clf.score(X_test, y_test) score = clf.score(X_test, y_test)
assert score >= 0.8 assert score >= 0.8
assert score <= 1. assert score <= 1.0
assert clf.n_features_in_ == 4 # number of input features assert clf.n_features_in_ == 4 # number of input features
assert len(clf.named_estimators_['gbm1'].feature_importances_) == 4 assert len(clf.named_estimators_["gbm1"].feature_importances_) == 4
assert clf.named_estimators_['gbm1'].n_features_in_ == clf.named_estimators_['gbm2'].n_features_in_ assert clf.named_estimators_["gbm1"].n_features_in_ == clf.named_estimators_["gbm2"].n_features_in_
assert clf.final_estimator_.n_features_in_ == 10 # number of concatenated features assert clf.final_estimator_.n_features_in_ == 10 # number of concatenated features
assert len(clf.final_estimator_.feature_importances_) == 10 assert len(clf.final_estimator_.feature_importances_) == 10
assert all(clf.named_estimators_['gbm1'].classes_ == clf.named_estimators_['gbm2'].classes_) assert all(clf.named_estimators_["gbm1"].classes_ == clf.named_estimators_["gbm2"].classes_)
assert all(clf.classes_ == clf.named_estimators_['gbm1'].classes_) assert all(clf.classes_ == clf.named_estimators_["gbm1"].classes_)
def test_stacking_regressor(): def test_stacking_regressor():
...@@ -292,18 +299,15 @@ def test_stacking_regressor(): ...@@ -292,18 +299,15 @@ def test_stacking_regressor():
n_features = X.shape[1] n_features = X.shape[1]
n_input_models = 2 n_input_models = 2
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
regressors = [('gbm1', lgb.LGBMRegressor(n_estimators=3)), regressors = [("gbm1", lgb.LGBMRegressor(n_estimators=3)), ("gbm2", lgb.LGBMRegressor(n_estimators=3))]
('gbm2', lgb.LGBMRegressor(n_estimators=3))] reg = StackingRegressor(estimators=regressors, final_estimator=lgb.LGBMRegressor(n_estimators=3), passthrough=True)
reg = StackingRegressor(estimators=regressors,
final_estimator=lgb.LGBMRegressor(n_estimators=3),
passthrough=True)
reg.fit(X_train, y_train) reg.fit(X_train, y_train)
score = reg.score(X_test, y_test) score = reg.score(X_test, y_test)
assert score >= 0.2 assert score >= 0.2
assert score <= 1. assert score <= 1.0
assert reg.n_features_in_ == n_features # number of input features assert reg.n_features_in_ == n_features # number of input features
assert len(reg.named_estimators_['gbm1'].feature_importances_) == n_features assert len(reg.named_estimators_["gbm1"].feature_importances_) == n_features
assert reg.named_estimators_['gbm1'].n_features_in_ == reg.named_estimators_['gbm2'].n_features_in_ assert reg.named_estimators_["gbm1"].n_features_in_ == reg.named_estimators_["gbm2"].n_features_in_
assert reg.final_estimator_.n_features_in_ == n_features + n_input_models # number of concatenated features assert reg.final_estimator_.n_features_in_ == n_features + n_input_models # number of concatenated features
assert len(reg.final_estimator_.feature_importances_) == n_features + n_input_models assert len(reg.final_estimator_.feature_importances_) == n_features + n_input_models
...@@ -313,91 +317,69 @@ def test_grid_search(): ...@@ -313,91 +317,69 @@ def test_grid_search():
y = y.astype(str) # utilize label encoder at it's max power y = y.astype(str) # utilize label encoder at it's max power
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42) X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
params = { params = {"subsample": 0.8, "subsample_freq": 1}
"subsample": 0.8, grid_params = {"boosting_type": ["rf", "gbdt"], "n_estimators": [4, 6], "reg_alpha": [0.01, 0.005]}
"subsample_freq": 1
}
grid_params = {
"boosting_type": ['rf', 'gbdt'],
"n_estimators": [4, 6],
"reg_alpha": [0.01, 0.005]
}
evals_result = {} evals_result = {}
fit_params = { fit_params = {
"eval_set": [(X_val, y_val)], "eval_set": [(X_val, y_val)],
"eval_metric": constant_metric, "eval_metric": constant_metric,
"callbacks": [ "callbacks": [lgb.early_stopping(2), lgb.record_evaluation(evals_result)],
lgb.early_stopping(2),
lgb.record_evaluation(evals_result)
]
} }
grid = GridSearchCV(estimator=lgb.LGBMClassifier(**params), param_grid=grid_params, cv=2) grid = GridSearchCV(estimator=lgb.LGBMClassifier(**params), param_grid=grid_params, cv=2)
grid.fit(X_train, y_train, **fit_params) grid.fit(X_train, y_train, **fit_params)
score = grid.score(X_test, y_test) # utilizes GridSearchCV default refit=True score = grid.score(X_test, y_test) # utilizes GridSearchCV default refit=True
assert grid.best_params_['boosting_type'] in ['rf', 'gbdt'] assert grid.best_params_["boosting_type"] in ["rf", "gbdt"]
assert grid.best_params_['n_estimators'] in [4, 6] assert grid.best_params_["n_estimators"] in [4, 6]
assert grid.best_params_['reg_alpha'] in [0.01, 0.005] assert grid.best_params_["reg_alpha"] in [0.01, 0.005]
assert grid.best_score_ <= 1. assert grid.best_score_ <= 1.0
assert grid.best_estimator_.best_iteration_ == 1 assert grid.best_estimator_.best_iteration_ == 1
assert grid.best_estimator_.best_score_['valid_0']['multi_logloss'] < 0.25 assert grid.best_estimator_.best_score_["valid_0"]["multi_logloss"] < 0.25
assert grid.best_estimator_.best_score_['valid_0']['error'] == 0 assert grid.best_estimator_.best_score_["valid_0"]["error"] == 0
assert score >= 0.2 assert score >= 0.2
assert score <= 1. assert score <= 1.0
assert evals_result == grid.best_estimator_.evals_result_ assert evals_result == grid.best_estimator_.evals_result_
def test_random_search(): def test_random_search():
X, y = load_iris(return_X_y=True) X, y = load_iris(return_X_y=True)
y = y.astype(str) # utilize label encoder at it's max power y = y.astype(str) # utilize label encoder at it's max power
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
random_state=42) X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1,
random_state=42)
n_iter = 3 # Number of samples n_iter = 3 # Number of samples
params = { params = {"subsample": 0.8, "subsample_freq": 1}
"subsample": 0.8,
"subsample_freq": 1
}
param_dist = { param_dist = {
"boosting_type": ['rf', 'gbdt'], "boosting_type": ["rf", "gbdt"],
"n_estimators": [np.random.randint(low=3, high=10) for i in range(n_iter)], "n_estimators": [np.random.randint(low=3, high=10) for i in range(n_iter)],
"reg_alpha": [np.random.uniform(low=0.01, high=0.06) for i in range(n_iter)] "reg_alpha": [np.random.uniform(low=0.01, high=0.06) for i in range(n_iter)],
}
fit_params = {
"eval_set": [(X_val, y_val)],
"eval_metric": constant_metric,
"callbacks": [lgb.early_stopping(2)]
} }
rand = RandomizedSearchCV(estimator=lgb.LGBMClassifier(**params), fit_params = {"eval_set": [(X_val, y_val)], "eval_metric": constant_metric, "callbacks": [lgb.early_stopping(2)]}
param_distributions=param_dist, cv=2, rand = RandomizedSearchCV(
n_iter=n_iter, random_state=42) estimator=lgb.LGBMClassifier(**params), param_distributions=param_dist, cv=2, n_iter=n_iter, random_state=42
)
rand.fit(X_train, y_train, **fit_params) rand.fit(X_train, y_train, **fit_params)
score = rand.score(X_test, y_test) # utilizes RandomizedSearchCV default refit=True score = rand.score(X_test, y_test) # utilizes RandomizedSearchCV default refit=True
assert rand.best_params_['boosting_type'] in ['rf', 'gbdt'] assert rand.best_params_["boosting_type"] in ["rf", "gbdt"]
assert rand.best_params_['n_estimators'] in list(range(3, 10)) assert rand.best_params_["n_estimators"] in list(range(3, 10))
assert rand.best_params_['reg_alpha'] >= 0.01 # Left-closed boundary point assert rand.best_params_["reg_alpha"] >= 0.01 # Left-closed boundary point
assert rand.best_params_['reg_alpha'] <= 0.06 # Right-closed boundary point assert rand.best_params_["reg_alpha"] <= 0.06 # Right-closed boundary point
assert rand.best_score_ <= 1. assert rand.best_score_ <= 1.0
assert rand.best_estimator_.best_score_['valid_0']['multi_logloss'] < 0.25 assert rand.best_estimator_.best_score_["valid_0"]["multi_logloss"] < 0.25
assert rand.best_estimator_.best_score_['valid_0']['error'] == 0 assert rand.best_estimator_.best_score_["valid_0"]["error"] == 0
assert score >= 0.2 assert score >= 0.2
assert score <= 1. assert score <= 1.0
def test_multioutput_classifier(): def test_multioutput_classifier():
n_outputs = 3 n_outputs = 3
X, y = make_multilabel_classification(n_samples=100, n_features=20, X, y = make_multilabel_classification(n_samples=100, n_features=20, n_classes=n_outputs, random_state=0)
n_classes=n_outputs, random_state=0)
y = y.astype(str) # utilize label encoder at it's max power y = y.astype(str) # utilize label encoder at it's max power
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
random_state=42)
clf = MultiOutputClassifier(estimator=lgb.LGBMClassifier(n_estimators=10)) clf = MultiOutputClassifier(estimator=lgb.LGBMClassifier(n_estimators=10))
clf.fit(X_train, y_train) clf.fit(X_train, y_train)
score = clf.score(X_test, y_test) score = clf.score(X_test, y_test)
assert score >= 0.2 assert score >= 0.2
assert score <= 1. assert score <= 1.0
np.testing.assert_array_equal(np.tile(np.unique(y_train), n_outputs), np.testing.assert_array_equal(np.tile(np.unique(y_train), n_outputs), np.concatenate(clf.classes_))
np.concatenate(clf.classes_))
for classifier in clf.estimators_: for classifier in clf.estimators_:
assert isinstance(classifier, lgb.LGBMClassifier) assert isinstance(classifier, lgb.LGBMClassifier)
assert isinstance(classifier.booster_, lgb.Booster) assert isinstance(classifier.booster_, lgb.Booster)
...@@ -405,15 +387,14 @@ def test_multioutput_classifier(): ...@@ -405,15 +387,14 @@ def test_multioutput_classifier():
def test_multioutput_regressor(): def test_multioutput_regressor():
bunch = load_linnerud(as_frame=True) # returns a Bunch instance bunch = load_linnerud(as_frame=True) # returns a Bunch instance
X, y = bunch['data'], bunch['target'] X, y = bunch["data"], bunch["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
random_state=42)
reg = MultiOutputRegressor(estimator=lgb.LGBMRegressor(n_estimators=10)) reg = MultiOutputRegressor(estimator=lgb.LGBMRegressor(n_estimators=10))
reg.fit(X_train, y_train) reg.fit(X_train, y_train)
y_pred = reg.predict(X_test) y_pred = reg.predict(X_test)
_, score, _ = mse(y_test, y_pred) _, score, _ = mse(y_test, y_pred)
assert score >= 0.2 assert score >= 0.2
assert score <= 120. assert score <= 120.0
for regressor in reg.estimators_: for regressor in reg.estimators_:
assert isinstance(regressor, lgb.LGBMRegressor) assert isinstance(regressor, lgb.LGBMRegressor)
assert isinstance(regressor.booster_, lgb.Booster) assert isinstance(regressor.booster_, lgb.Booster)
...@@ -421,19 +402,15 @@ def test_multioutput_regressor(): ...@@ -421,19 +402,15 @@ def test_multioutput_regressor():
def test_classifier_chain(): def test_classifier_chain():
n_outputs = 3 n_outputs = 3
X, y = make_multilabel_classification(n_samples=100, n_features=20, X, y = make_multilabel_classification(n_samples=100, n_features=20, n_classes=n_outputs, random_state=0)
n_classes=n_outputs, random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
random_state=42)
order = [2, 0, 1] order = [2, 0, 1]
clf = ClassifierChain(base_estimator=lgb.LGBMClassifier(n_estimators=10), clf = ClassifierChain(base_estimator=lgb.LGBMClassifier(n_estimators=10), order=order, random_state=42)
order=order, random_state=42)
clf.fit(X_train, y_train) clf.fit(X_train, y_train)
score = clf.score(X_test, y_test) score = clf.score(X_test, y_test)
assert score >= 0.2 assert score >= 0.2
assert score <= 1. assert score <= 1.0
np.testing.assert_array_equal(np.tile(np.unique(y_train), n_outputs), np.testing.assert_array_equal(np.tile(np.unique(y_train), n_outputs), np.concatenate(clf.classes_))
np.concatenate(clf.classes_))
assert order == clf.order_ assert order == clf.order_
for classifier in clf.estimators_: for classifier in clf.estimators_:
assert isinstance(classifier, lgb.LGBMClassifier) assert isinstance(classifier, lgb.LGBMClassifier)
...@@ -442,16 +419,15 @@ def test_classifier_chain(): ...@@ -442,16 +419,15 @@ def test_classifier_chain():
def test_regressor_chain(): def test_regressor_chain():
bunch = load_linnerud(as_frame=True) # returns a Bunch instance bunch = load_linnerud(as_frame=True) # returns a Bunch instance
X, y = bunch['data'], bunch['target'] X, y = bunch["data"], bunch["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
order = [2, 0, 1] order = [2, 0, 1]
reg = RegressorChain(base_estimator=lgb.LGBMRegressor(n_estimators=10), order=order, reg = RegressorChain(base_estimator=lgb.LGBMRegressor(n_estimators=10), order=order, random_state=42)
random_state=42)
reg.fit(X_train, y_train) reg.fit(X_train, y_train)
y_pred = reg.predict(X_test) y_pred = reg.predict(X_test)
_, score, _ = mse(y_test, y_pred) _, score, _ = mse(y_test, y_pred)
assert score >= 0.2 assert score >= 0.2
assert score <= 120. assert score <= 120.0
assert order == reg.order_ assert order == reg.order_
for regressor in reg.estimators_: for regressor in reg.estimators_:
assert isinstance(regressor, lgb.LGBMRegressor) assert isinstance(regressor, lgb.LGBMRegressor)
...@@ -489,24 +465,17 @@ def test_clone_and_property(): ...@@ -489,24 +465,17 @@ def test_clone_and_property():
def test_joblib(): def test_joblib():
X, y = make_synthetic_regression() X, y = make_synthetic_regression()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMRegressor(n_estimators=10, objective=custom_asymmetric_obj, gbm = lgb.LGBMRegressor(n_estimators=10, objective=custom_asymmetric_obj, verbose=-1, importance_type="split")
verbose=-1, importance_type='split')
gbm.fit( gbm.fit(
X_train, X_train,
y_train, y_train,
eval_set=[ eval_set=[(X_train, y_train), (X_test, y_test)],
(X_train, y_train),
(X_test, y_test)
],
eval_metric=mse, eval_metric=mse,
callbacks=[ callbacks=[lgb.early_stopping(5), lgb.reset_parameter(learning_rate=list(np.arange(1, 0, -0.1)))],
lgb.early_stopping(5),
lgb.reset_parameter(learning_rate=list(np.arange(1, 0, -0.1)))
]
) )
joblib.dump(gbm, 'lgb.pkl') # test model with custom functions joblib.dump(gbm, "lgb.pkl") # test model with custom functions
gbm_pickle = joblib.load('lgb.pkl') gbm_pickle = joblib.load("lgb.pkl")
assert isinstance(gbm_pickle.booster_, lgb.Booster) assert isinstance(gbm_pickle.booster_, lgb.Booster)
assert gbm.get_params() == gbm_pickle.get_params() assert gbm.get_params() == gbm_pickle.get_params()
np.testing.assert_array_equal(gbm.feature_importances_, gbm_pickle.feature_importances_) np.testing.assert_array_equal(gbm.feature_importances_, gbm_pickle.feature_importances_)
...@@ -515,8 +484,7 @@ def test_joblib(): ...@@ -515,8 +484,7 @@ def test_joblib():
for eval_set in gbm.evals_result_: for eval_set in gbm.evals_result_:
for metric in gbm.evals_result_[eval_set]: for metric in gbm.evals_result_[eval_set]:
np.testing.assert_allclose(gbm.evals_result_[eval_set][metric], np.testing.assert_allclose(gbm.evals_result_[eval_set][metric], gbm_pickle.evals_result_[eval_set][metric])
gbm_pickle.evals_result_[eval_set][metric])
pred_origin = gbm.predict(X_test) pred_origin = gbm.predict(X_test)
pred_pickle = gbm_pickle.predict(X_test) pred_pickle = gbm_pickle.predict(X_test)
np.testing.assert_allclose(pred_origin, pred_pickle) np.testing.assert_allclose(pred_origin, pred_pickle)
...@@ -526,7 +494,7 @@ def test_non_serializable_objects_in_callbacks(tmp_path): ...@@ -526,7 +494,7 @@ def test_non_serializable_objects_in_callbacks(tmp_path):
unpicklable_callback = UnpicklableCallback() unpicklable_callback = UnpicklableCallback()
with pytest.raises(Exception, match="This class in not picklable"): with pytest.raises(Exception, match="This class in not picklable"):
joblib.dump(unpicklable_callback, tmp_path / 'tmp.joblib') joblib.dump(unpicklable_callback, tmp_path / "tmp.joblib")
X, y = make_synthetic_regression() X, y = make_synthetic_regression()
gbm = lgb.LGBMRegressor(n_estimators=5) gbm = lgb.LGBMRegressor(n_estimators=5)
...@@ -578,9 +546,9 @@ def test_feature_importances_type(): ...@@ -578,9 +546,9 @@ def test_feature_importances_type():
data = load_iris(return_X_y=False) data = load_iris(return_X_y=False)
clf = lgb.LGBMClassifier(n_estimators=10) clf = lgb.LGBMClassifier(n_estimators=10)
clf.fit(data.data, data.target) clf.fit(data.data, data.target)
clf.set_params(importance_type='split') clf.set_params(importance_type="split")
importances_split = clf.feature_importances_ importances_split = clf.feature_importances_
clf.set_params(importance_type='gain') clf.set_params(importance_type="gain")
importances_gain = clf.feature_importances_ importances_gain = clf.feature_importances_
# Test that the largest element is NOT the same, the smallest can be the same, i.e. zero # Test that the largest element is NOT the same, the smallest can be the same, i.e. zero
importance_split_top1 = sorted(importances_split, reverse=True)[0] importance_split_top1 = sorted(importances_split, reverse=True)[0]
...@@ -591,38 +559,44 @@ def test_feature_importances_type(): ...@@ -591,38 +559,44 @@ def test_feature_importances_type():
def test_pandas_categorical(): def test_pandas_categorical():
pd = pytest.importorskip("pandas") pd = pytest.importorskip("pandas")
np.random.seed(42) # sometimes there is no difference how cols are treated (cat or not cat) np.random.seed(42) # sometimes there is no difference how cols are treated (cat or not cat)
X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75), # str X = pd.DataFrame(
"B": np.random.permutation([1, 2, 3] * 100), # int {
"C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float "A": np.random.permutation(["a", "b", "c", "d"] * 75), # str
"D": np.random.permutation([True, False] * 150), # bool "B": np.random.permutation([1, 2, 3] * 100), # int
"E": pd.Categorical(np.random.permutation(['z', 'y', 'x', 'w', 'v'] * 60), "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float
ordered=True)}) # str and ordered categorical "D": np.random.permutation([True, False] * 150), # bool
"E": pd.Categorical(np.random.permutation(["z", "y", "x", "w", "v"] * 60), ordered=True),
}
) # str and ordered categorical
y = np.random.permutation([0, 1] * 150) y = np.random.permutation([0, 1] * 150)
X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20), # unseen category X_test = pd.DataFrame(
"B": np.random.permutation([1, 3] * 30), {
"C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15), "A": np.random.permutation(["a", "b", "e"] * 20), # unseen category
"D": np.random.permutation([True, False] * 30), "B": np.random.permutation([1, 3] * 30),
"E": pd.Categorical(np.random.permutation(['z', 'y'] * 30), "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
ordered=True)}) "D": np.random.permutation([True, False] * 30),
"E": pd.Categorical(np.random.permutation(["z", "y"] * 30), ordered=True),
}
)
np.random.seed() # reset seed np.random.seed() # reset seed
cat_cols_actual = ["A", "B", "C", "D"] cat_cols_actual = ["A", "B", "C", "D"]
cat_cols_to_store = cat_cols_actual + ["E"] cat_cols_to_store = cat_cols_actual + ["E"]
X[cat_cols_actual] = X[cat_cols_actual].astype('category') X[cat_cols_actual] = X[cat_cols_actual].astype("category")
X_test[cat_cols_actual] = X_test[cat_cols_actual].astype('category') X_test[cat_cols_actual] = X_test[cat_cols_actual].astype("category")
cat_values = [X[col].cat.categories.tolist() for col in cat_cols_to_store] cat_values = [X[col].cat.categories.tolist() for col in cat_cols_to_store]
gbm0 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y) gbm0 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y)
pred0 = gbm0.predict(X_test, raw_score=True) pred0 = gbm0.predict(X_test, raw_score=True)
pred_prob = gbm0.predict_proba(X_test)[:, 1] pred_prob = gbm0.predict_proba(X_test)[:, 1]
gbm1 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, pd.Series(y), categorical_feature=[0]) gbm1 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, pd.Series(y), categorical_feature=[0])
pred1 = gbm1.predict(X_test, raw_score=True) pred1 = gbm1.predict(X_test, raw_score=True)
gbm2 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=['A']) gbm2 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=["A"])
pred2 = gbm2.predict(X_test, raw_score=True) pred2 = gbm2.predict(X_test, raw_score=True)
gbm3 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=['A', 'B', 'C', 'D']) gbm3 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=["A", "B", "C", "D"])
pred3 = gbm3.predict(X_test, raw_score=True) pred3 = gbm3.predict(X_test, raw_score=True)
gbm3.booster_.save_model('categorical.model') gbm3.booster_.save_model("categorical.model")
gbm4 = lgb.Booster(model_file='categorical.model') gbm4 = lgb.Booster(model_file="categorical.model")
pred4 = gbm4.predict(X_test) pred4 = gbm4.predict(X_test)
gbm5 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=['A', 'B', 'C', 'D', 'E']) gbm5 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=["A", "B", "C", "D", "E"])
pred5 = gbm5.predict(X_test, raw_score=True) pred5 = gbm5.predict(X_test, raw_score=True)
gbm6 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=[]) gbm6 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=[])
pred6 = gbm6.predict(X_test, raw_score=True) pred6 = gbm6.predict(X_test, raw_score=True)
...@@ -648,18 +622,26 @@ def test_pandas_categorical(): ...@@ -648,18 +622,26 @@ def test_pandas_categorical():
def test_pandas_sparse(): def test_pandas_sparse():
pd = pytest.importorskip("pandas") pd = pytest.importorskip("pandas")
X = pd.DataFrame({"A": pd.arrays.SparseArray(np.random.permutation([0, 1, 2] * 100)), X = pd.DataFrame(
"B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)), {
"C": pd.arrays.SparseArray(np.random.permutation([True, False] * 150))}) "A": pd.arrays.SparseArray(np.random.permutation([0, 1, 2] * 100)),
"B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
"C": pd.arrays.SparseArray(np.random.permutation([True, False] * 150)),
}
)
y = pd.Series(pd.arrays.SparseArray(np.random.permutation([0, 1] * 150))) y = pd.Series(pd.arrays.SparseArray(np.random.permutation([0, 1] * 150)))
X_test = pd.DataFrame({"A": pd.arrays.SparseArray(np.random.permutation([0, 2] * 30)), X_test = pd.DataFrame(
"B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)), {
"C": pd.arrays.SparseArray(np.random.permutation([True, False] * 30))}) "A": pd.arrays.SparseArray(np.random.permutation([0, 2] * 30)),
"B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
"C": pd.arrays.SparseArray(np.random.permutation([True, False] * 30)),
}
)
for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]): for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
assert pd.api.types.is_sparse(dtype) assert pd.api.types.is_sparse(dtype)
gbm = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y) gbm = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y)
pred_sparse = gbm.predict(X_test, raw_score=True) pred_sparse = gbm.predict(X_test, raw_score=True)
if hasattr(X_test, 'sparse'): if hasattr(X_test, "sparse"):
pred_dense = gbm.predict(X_test.sparse.to_dense(), raw_score=True) pred_dense = gbm.predict(X_test.sparse.to_dense(), raw_score=True)
else: else:
pred_dense = gbm.predict(X_test.to_dense(), raw_score=True) pred_dense = gbm.predict(X_test.to_dense(), raw_score=True)
...@@ -669,13 +651,9 @@ def test_pandas_sparse(): ...@@ -669,13 +651,9 @@ def test_pandas_sparse():
def test_predict(): def test_predict():
# With default params # With default params
iris = load_iris(return_X_y=False) iris = load_iris(return_X_y=False)
X_train, X_test, y_train, _ = train_test_split(iris.data, iris.target, X_train, X_test, y_train, _ = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)
test_size=0.2, random_state=42)
gbm = lgb.train({'objective': 'multiclass', gbm = lgb.train({"objective": "multiclass", "num_class": 3, "verbose": -1}, lgb.Dataset(X_train, y_train))
'num_class': 3,
'verbose': -1},
lgb.Dataset(X_train, y_train))
clf = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train) clf = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train)
# Tests same probabilities # Tests same probabilities
...@@ -705,9 +683,7 @@ def test_predict(): ...@@ -705,9 +683,7 @@ def test_predict():
# Tests other parameters for the prediction works # Tests other parameters for the prediction works
res_engine = gbm.predict(X_test) res_engine = gbm.predict(X_test)
res_sklearn_params = clf.predict_proba(X_test, res_sklearn_params = clf.predict_proba(X_test, pred_early_stop=True, pred_early_stop_margin=1.0)
pred_early_stop=True,
pred_early_stop_margin=1.0)
with pytest.raises(AssertionError): with pytest.raises(AssertionError):
np.testing.assert_allclose(res_engine, res_sklearn_params) np.testing.assert_allclose(res_engine, res_sklearn_params)
...@@ -739,9 +715,7 @@ def test_predict(): ...@@ -739,9 +715,7 @@ def test_predict():
# Tests other parameters for the prediction works, starting from iteration 10 # Tests other parameters for the prediction works, starting from iteration 10
res_engine = gbm.predict(X_test, start_iteration=10) res_engine = gbm.predict(X_test, start_iteration=10)
res_sklearn_params = clf.predict_proba(X_test, res_sklearn_params = clf.predict_proba(X_test, pred_early_stop=True, pred_early_stop_margin=1.0, start_iteration=10)
pred_early_stop=True,
pred_early_stop_margin=1.0, start_iteration=10)
with pytest.raises(AssertionError): with pytest.raises(AssertionError):
np.testing.assert_allclose(res_engine, res_sklearn_params) np.testing.assert_allclose(res_engine, res_sklearn_params)
...@@ -750,34 +724,43 @@ def test_predict_with_params_from_init(): ...@@ -750,34 +724,43 @@ def test_predict_with_params_from_init():
X, y = load_iris(return_X_y=True) X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42) X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)
predict_params = { predict_params = {"pred_early_stop": True, "pred_early_stop_margin": 1.0}
'pred_early_stop': True,
'pred_early_stop_margin': 1.0
}
y_preds_no_params = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train).predict( y_preds_no_params = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train).predict(X_test, raw_score=True)
X_test, raw_score=True)
y_preds_params_in_predict = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train).predict( y_preds_params_in_predict = (
X_test, raw_score=True, **predict_params) lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train).predict(X_test, raw_score=True, **predict_params)
)
with pytest.raises(AssertionError): with pytest.raises(AssertionError):
np.testing.assert_allclose(y_preds_no_params, y_preds_params_in_predict) np.testing.assert_allclose(y_preds_no_params, y_preds_params_in_predict)
y_preds_params_in_set_params_before_fit = lgb.LGBMClassifier(verbose=-1).set_params( y_preds_params_in_set_params_before_fit = (
**predict_params).fit(X_train, y_train).predict(X_test, raw_score=True) lgb.LGBMClassifier(verbose=-1)
.set_params(**predict_params)
.fit(X_train, y_train)
.predict(X_test, raw_score=True)
)
np.testing.assert_allclose(y_preds_params_in_predict, y_preds_params_in_set_params_before_fit) np.testing.assert_allclose(y_preds_params_in_predict, y_preds_params_in_set_params_before_fit)
y_preds_params_in_set_params_after_fit = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train).set_params( y_preds_params_in_set_params_after_fit = (
**predict_params).predict(X_test, raw_score=True) lgb.LGBMClassifier(verbose=-1)
.fit(X_train, y_train)
.set_params(**predict_params)
.predict(X_test, raw_score=True)
)
np.testing.assert_allclose(y_preds_params_in_predict, y_preds_params_in_set_params_after_fit) np.testing.assert_allclose(y_preds_params_in_predict, y_preds_params_in_set_params_after_fit)
y_preds_params_in_init = lgb.LGBMClassifier(verbose=-1, **predict_params).fit(X_train, y_train).predict( y_preds_params_in_init = (
X_test, raw_score=True) lgb.LGBMClassifier(verbose=-1, **predict_params).fit(X_train, y_train).predict(X_test, raw_score=True)
)
np.testing.assert_allclose(y_preds_params_in_predict, y_preds_params_in_init) np.testing.assert_allclose(y_preds_params_in_predict, y_preds_params_in_init)
# test that params passed in predict have higher priority # test that params passed in predict have higher priority
y_preds_params_overwritten = lgb.LGBMClassifier(verbose=-1, **predict_params).fit(X_train, y_train).predict( y_preds_params_overwritten = (
X_test, raw_score=True, pred_early_stop=False) lgb.LGBMClassifier(verbose=-1, **predict_params)
.fit(X_train, y_train)
.predict(X_test, raw_score=True, pred_early_stop=False)
)
np.testing.assert_allclose(y_preds_no_params, y_preds_params_overwritten) np.testing.assert_allclose(y_preds_no_params, y_preds_params_overwritten)
...@@ -787,315 +770,307 @@ def test_evaluate_train_set(): ...@@ -787,315 +770,307 @@ def test_evaluate_train_set():
gbm = lgb.LGBMRegressor(n_estimators=10, verbose=-1) gbm = lgb.LGBMRegressor(n_estimators=10, verbose=-1)
gbm.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)]) gbm.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])
assert len(gbm.evals_result_) == 2 assert len(gbm.evals_result_) == 2
assert 'training' in gbm.evals_result_ assert "training" in gbm.evals_result_
assert len(gbm.evals_result_['training']) == 1 assert len(gbm.evals_result_["training"]) == 1
assert 'l2' in gbm.evals_result_['training'] assert "l2" in gbm.evals_result_["training"]
assert 'valid_1' in gbm.evals_result_ assert "valid_1" in gbm.evals_result_
assert len(gbm.evals_result_['valid_1']) == 1 assert len(gbm.evals_result_["valid_1"]) == 1
assert 'l2' in gbm.evals_result_['valid_1'] assert "l2" in gbm.evals_result_["valid_1"]
def test_metrics():
    """Check which metrics end up in ``evals_result_`` for every combination of
    objective (default / non-default / custom) and metric source
    (``metric`` param / ``eval_metric`` in ``fit()`` / custom callable)."""
    X, y = make_synthetic_regression()
    y = abs(y)
    params = {"n_estimators": 2, "verbose": -1}
    params_fit = {"X": X, "y": y, "eval_set": (X, y)}

    # no custom objective, no custom metric
    # default metric
    gbm = lgb.LGBMRegressor(**params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "l2" in gbm.evals_result_["training"]

    # non-default metric
    gbm = lgb.LGBMRegressor(metric="mape", **params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "mape" in gbm.evals_result_["training"]

    # no metric
    gbm = lgb.LGBMRegressor(metric="None", **params).fit(**params_fit)
    assert gbm.evals_result_ == {}

    # non-default metric in eval_metric
    gbm = lgb.LGBMRegressor(**params).fit(eval_metric="mape", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "l2" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]

    # non-default metric with non-default metric in eval_metric
    gbm = lgb.LGBMRegressor(metric="gamma", **params).fit(eval_metric="mape", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "gamma" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]

    # non-default metric with multiple metrics in eval_metric
    gbm = lgb.LGBMRegressor(metric="gamma", **params).fit(eval_metric=["l2", "mape"], **params_fit)
    assert len(gbm.evals_result_["training"]) == 3
    assert "gamma" in gbm.evals_result_["training"]
    assert "l2" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]

    # non-default metric with multiple metrics in eval_metric for LGBMClassifier
    X_classification, y_classification = load_breast_cancer(return_X_y=True)
    params_classification = {"n_estimators": 2, "verbose": -1, "objective": "binary", "metric": "binary_logloss"}
    params_fit_classification = {
        "X": X_classification,
        "y": y_classification,
        "eval_set": (X_classification, y_classification),
    }
    gbm = lgb.LGBMClassifier(**params_classification).fit(eval_metric=["fair", "error"], **params_fit_classification)
    assert len(gbm.evals_result_["training"]) == 3
    assert "fair" in gbm.evals_result_["training"]
    assert "binary_error" in gbm.evals_result_["training"]
    assert "binary_logloss" in gbm.evals_result_["training"]

    # default metric for non-default objective
    gbm = lgb.LGBMRegressor(objective="regression_l1", **params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "l1" in gbm.evals_result_["training"]

    # non-default metric for non-default objective
    gbm = lgb.LGBMRegressor(objective="regression_l1", metric="mape", **params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "mape" in gbm.evals_result_["training"]

    # no metric
    gbm = lgb.LGBMRegressor(objective="regression_l1", metric="None", **params).fit(**params_fit)
    assert gbm.evals_result_ == {}

    # non-default metric in eval_metric for non-default objective
    gbm = lgb.LGBMRegressor(objective="regression_l1", **params).fit(eval_metric="mape", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "l1" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]

    # non-default metric with non-default metric in eval_metric for non-default objective
    gbm = lgb.LGBMRegressor(objective="regression_l1", metric="gamma", **params).fit(eval_metric="mape", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "gamma" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]

    # non-default metric with multiple metrics in eval_metric for non-default objective
    gbm = lgb.LGBMRegressor(objective="regression_l1", metric="gamma", **params).fit(
        eval_metric=["l2", "mape"], **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 3
    assert "gamma" in gbm.evals_result_["training"]
    assert "l2" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]

    # custom objective, no custom metric
    # default regression metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, **params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "l2" in gbm.evals_result_["training"]

    # non-default regression metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric="mape", **params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "mape" in gbm.evals_result_["training"]

    # multiple regression metrics for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=["l1", "gamma"], **params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "l1" in gbm.evals_result_["training"]
    assert "gamma" in gbm.evals_result_["training"]

    # no metric
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric="None", **params).fit(**params_fit)
    assert gbm.evals_result_ == {}

    # default regression metric with non-default metric in eval_metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, **params).fit(eval_metric="mape", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "l2" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]

    # non-default regression metric with metric in eval_metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric="mape", **params).fit(eval_metric="gamma", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "mape" in gbm.evals_result_["training"]
    assert "gamma" in gbm.evals_result_["training"]

    # multiple regression metrics with metric in eval_metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=["l1", "gamma"], **params).fit(
        eval_metric="l2", **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 3
    assert "l1" in gbm.evals_result_["training"]
    assert "gamma" in gbm.evals_result_["training"]
    assert "l2" in gbm.evals_result_["training"]

    # multiple regression metrics with multiple metrics in eval_metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=["l1", "gamma"], **params).fit(
        eval_metric=["l2", "mape"], **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 4
    assert "l1" in gbm.evals_result_["training"]
    assert "gamma" in gbm.evals_result_["training"]
    assert "l2" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]

    # no custom objective, custom metric
    # default metric with custom metric
    gbm = lgb.LGBMRegressor(**params).fit(eval_metric=constant_metric, **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "l2" in gbm.evals_result_["training"]
    assert "error" in gbm.evals_result_["training"]

    # non-default metric with custom metric
    gbm = lgb.LGBMRegressor(metric="mape", **params).fit(eval_metric=constant_metric, **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "mape" in gbm.evals_result_["training"]
    assert "error" in gbm.evals_result_["training"]

    # multiple metrics with custom metric
    gbm = lgb.LGBMRegressor(metric=["l1", "gamma"], **params).fit(eval_metric=constant_metric, **params_fit)
    assert len(gbm.evals_result_["training"]) == 3
    assert "l1" in gbm.evals_result_["training"]
    assert "gamma" in gbm.evals_result_["training"]
    assert "error" in gbm.evals_result_["training"]

    # custom metric (disable default metric)
    gbm = lgb.LGBMRegressor(metric="None", **params).fit(eval_metric=constant_metric, **params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "error" in gbm.evals_result_["training"]

    # default metric for non-default objective with custom metric
    gbm = lgb.LGBMRegressor(objective="regression_l1", **params).fit(eval_metric=constant_metric, **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "l1" in gbm.evals_result_["training"]
    assert "error" in gbm.evals_result_["training"]

    # non-default metric for non-default objective with custom metric
    gbm = lgb.LGBMRegressor(objective="regression_l1", metric="mape", **params).fit(
        eval_metric=constant_metric, **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 2
    assert "mape" in gbm.evals_result_["training"]
    assert "error" in gbm.evals_result_["training"]

    # multiple metrics for non-default objective with custom metric
    gbm = lgb.LGBMRegressor(objective="regression_l1", metric=["l1", "gamma"], **params).fit(
        eval_metric=constant_metric, **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 3
    assert "l1" in gbm.evals_result_["training"]
    assert "gamma" in gbm.evals_result_["training"]
    assert "error" in gbm.evals_result_["training"]

    # custom metric (disable default metric for non-default objective)
    gbm = lgb.LGBMRegressor(objective="regression_l1", metric="None", **params).fit(
        eval_metric=constant_metric, **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 1
    assert "error" in gbm.evals_result_["training"]

    # custom objective, custom metric
    # custom metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, **params).fit(eval_metric=constant_metric, **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "error" in gbm.evals_result_["training"]

    # non-default regression metric with custom metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric="mape", **params).fit(
        eval_metric=constant_metric, **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 2
    assert "mape" in gbm.evals_result_["training"]
    assert "error" in gbm.evals_result_["training"]

    # multiple regression metrics with custom metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=["l2", "mape"], **params).fit(
        eval_metric=constant_metric, **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 3
    assert "l2" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]
    assert "error" in gbm.evals_result_["training"]

    X, y = load_digits(n_class=3, return_X_y=True)
    params_fit = {"X": X, "y": y, "eval_set": (X, y)}

    # default metric and invalid binary metric is replaced with multiclass alternative
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric="binary_error", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "multi_logloss" in gbm.evals_result_["training"]
    assert "multi_error" in gbm.evals_result_["training"]

    # invalid binary metric is replaced with multiclass alternative
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric="binary_error", **params_fit)
    assert gbm.objective_ == "multiclass"
    assert len(gbm.evals_result_["training"]) == 2
    assert "multi_logloss" in gbm.evals_result_["training"]
    assert "multi_error" in gbm.evals_result_["training"]

    # default metric for non-default multiclass objective
    # and invalid binary metric is replaced with multiclass alternative
    gbm = lgb.LGBMClassifier(objective="ovr", **params).fit(eval_metric="binary_error", **params_fit)
    assert gbm.objective_ == "ovr"
    assert len(gbm.evals_result_["training"]) == 2
    assert "multi_logloss" in gbm.evals_result_["training"]
    assert "multi_error" in gbm.evals_result_["training"]

    X, y = load_digits(n_class=2, return_X_y=True)
    params_fit = {"X": X, "y": y, "eval_set": (X, y)}

    # default metric and invalid multiclass metric is replaced with binary alternative
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric="multi_error", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "binary_logloss" in gbm.evals_result_["training"]
    assert "binary_error" in gbm.evals_result_["training"]

    # invalid multiclass metric is replaced with binary alternative for custom objective
    gbm = lgb.LGBMClassifier(objective=custom_dummy_obj, **params).fit(eval_metric="multi_logloss", **params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "binary_logloss" in gbm.evals_result_["training"]
def test_multiple_eval_metrics():
    """Check that ``fit(eval_metric=...)`` accepts lists mixing custom callables,
    built-in metric names, an empty list, and ``None`` entries."""
    X, y = load_breast_cancer(return_X_y=True)

    params = {"n_estimators": 2, "verbose": -1, "objective": "binary", "metric": "binary_logloss"}
    params_fit = {"X": X, "y": y, "eval_set": (X, y)}

    # Verify that can receive a list of metrics, only callable
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric=[constant_metric, decreasing_metric], **params_fit)
    assert len(gbm.evals_result_["training"]) == 3
    assert "error" in gbm.evals_result_["training"]
    assert "decreasing_metric" in gbm.evals_result_["training"]
    assert "binary_logloss" in gbm.evals_result_["training"]

    # Verify that can receive a list of custom and built-in metrics
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric=[constant_metric, decreasing_metric, "fair"], **params_fit)
    assert len(gbm.evals_result_["training"]) == 4
    assert "error" in gbm.evals_result_["training"]
    assert "decreasing_metric" in gbm.evals_result_["training"]
    assert "binary_logloss" in gbm.evals_result_["training"]
    assert "fair" in gbm.evals_result_["training"]

    # Verify that works as expected when eval_metric is empty
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric=[], **params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "binary_logloss" in gbm.evals_result_["training"]

    # Verify that can receive a list of metrics, only built-in
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric=["fair", "error"], **params_fit)
    assert len(gbm.evals_result_["training"]) == 3
    assert "binary_logloss" in gbm.evals_result_["training"]

    # Verify that eval_metric is robust to receiving a list with None
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric=["fair", "error", None], **params_fit)
    assert len(gbm.evals_result_["training"]) == 3
    assert "binary_logloss" in gbm.evals_result_["training"]
def test_nan_handle(): def test_nan_handle():
...@@ -1104,18 +1079,18 @@ def test_nan_handle(): ...@@ -1104,18 +1079,18 @@ def test_nan_handle():
X = np.random.randn(nrows, ncols) X = np.random.randn(nrows, ncols)
y = np.random.randn(nrows) + np.full(nrows, 1e30) y = np.random.randn(nrows) + np.full(nrows, 1e30)
weight = np.zeros(nrows) weight = np.zeros(nrows)
params = {'n_estimators': 20, 'verbose': -1} params = {"n_estimators": 20, "verbose": -1}
params_fit = {'X': X, 'y': y, 'sample_weight': weight, 'eval_set': (X, y), params_fit = {"X": X, "y": y, "sample_weight": weight, "eval_set": (X, y), "callbacks": [lgb.early_stopping(5)]}
'callbacks': [lgb.early_stopping(5)]}
gbm = lgb.LGBMRegressor(**params).fit(**params_fit) gbm = lgb.LGBMRegressor(**params).fit(**params_fit)
np.testing.assert_allclose(gbm.evals_result_['training']['l2'], np.nan) np.testing.assert_allclose(gbm.evals_result_["training"]["l2"], np.nan)
@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version') @pytest.mark.skipif(
getenv("TASK", "") == "cuda", reason="Skip due to differences in implementation details of CUDA version"
)
def test_first_metric_only(): def test_first_metric_only():
def fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_only): def fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_only):
params['first_metric_only'] = first_metric_only params["first_metric_only"] = first_metric_only
gbm = lgb.LGBMRegressor(**params).fit(**params_fit) gbm = lgb.LGBMRegressor(**params).fit(**params_fit)
assert len(gbm.evals_result_) == len(eval_set_names) assert len(gbm.evals_result_) == len(eval_set_names)
for eval_set_name in eval_set_names: for eval_set_name in eval_set_names:
...@@ -1125,11 +1100,13 @@ def test_first_metric_only(): ...@@ -1125,11 +1100,13 @@ def test_first_metric_only():
assert metric_name in gbm.evals_result_[eval_set_name] assert metric_name in gbm.evals_result_[eval_set_name]
actual = len(gbm.evals_result_[eval_set_name][metric_name]) actual = len(gbm.evals_result_[eval_set_name][metric_name])
expected = assumed_iteration + (params['early_stopping_rounds'] expected = assumed_iteration + (
if eval_set_name != 'training' params["early_stopping_rounds"]
and assumed_iteration != gbm.n_estimators else 0) if eval_set_name != "training" and assumed_iteration != gbm.n_estimators
else 0
)
assert expected == actual assert expected == actual
if eval_set_name != 'training': if eval_set_name != "training":
assert assumed_iteration == gbm.best_iteration_ assert assumed_iteration == gbm.best_iteration_
else: else:
assert gbm.n_estimators == gbm.best_iteration_ assert gbm.n_estimators == gbm.best_iteration_
...@@ -1137,14 +1114,15 @@ def test_first_metric_only(): ...@@ -1137,14 +1114,15 @@ def test_first_metric_only():
X, y = make_synthetic_regression(n_samples=300) X, y = make_synthetic_regression(n_samples=300)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, test_size=0.5, random_state=72) X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, test_size=0.5, random_state=72)
params = {'n_estimators': 30, params = {
'learning_rate': 0.8, "n_estimators": 30,
'num_leaves': 15, "learning_rate": 0.8,
'verbose': -1, "num_leaves": 15,
'seed': 123, "verbose": -1,
'early_stopping_rounds': 5} # early stop should be supported via global LightGBM parameter "seed": 123,
params_fit = {'X': X_train, "early_stopping_rounds": 5,
'y': y_train} } # early stop should be supported via global LightGBM parameter
params_fit = {"X": X_train, "y": y_train}
iter_valid1_l1 = 4 iter_valid1_l1 = 4
iter_valid1_l2 = 4 iter_valid1_l2 = 4
...@@ -1157,100 +1135,116 @@ def test_first_metric_only(): ...@@ -1157,100 +1135,116 @@ def test_first_metric_only():
iter_min_valid1 = min([iter_valid1_l1, iter_valid1_l2]) iter_min_valid1 = min([iter_valid1_l1, iter_valid1_l2])
# feval # feval
params['metric'] = 'None' params["metric"] = "None"
params_fit['eval_metric'] = lambda preds, train_data: [decreasing_metric(preds, train_data), params_fit["eval_metric"] = lambda preds, train_data: [
constant_metric(preds, train_data)] decreasing_metric(preds, train_data),
params_fit['eval_set'] = (X_test1, y_test1) constant_metric(preds, train_data),
fit_and_check(['valid_0'], ['decreasing_metric', 'error'], 1, False) ]
fit_and_check(['valid_0'], ['decreasing_metric', 'error'], 30, True) params_fit["eval_set"] = (X_test1, y_test1)
params_fit['eval_metric'] = lambda preds, train_data: [constant_metric(preds, train_data), fit_and_check(["valid_0"], ["decreasing_metric", "error"], 1, False)
decreasing_metric(preds, train_data)] fit_and_check(["valid_0"], ["decreasing_metric", "error"], 30, True)
fit_and_check(['valid_0'], ['decreasing_metric', 'error'], 1, True) params_fit["eval_metric"] = lambda preds, train_data: [
constant_metric(preds, train_data),
decreasing_metric(preds, train_data),
]
fit_and_check(["valid_0"], ["decreasing_metric", "error"], 1, True)
# single eval_set # single eval_set
params.pop('metric') params.pop("metric")
params_fit.pop('eval_metric') params_fit.pop("eval_metric")
fit_and_check(['valid_0'], ['l2'], iter_valid1_l2, False) fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, False)
fit_and_check(['valid_0'], ['l2'], iter_valid1_l2, True) fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, True)
params_fit['eval_metric'] = "l2" params_fit["eval_metric"] = "l2"
fit_and_check(['valid_0'], ['l2'], iter_valid1_l2, False) fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, False)
fit_and_check(['valid_0'], ['l2'], iter_valid1_l2, True) fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, True)
params_fit['eval_metric'] = "l1" params_fit["eval_metric"] = "l1"
fit_and_check(['valid_0'], ['l1', 'l2'], iter_min_valid1, False) fit_and_check(["valid_0"], ["l1", "l2"], iter_min_valid1, False)
fit_and_check(['valid_0'], ['l1', 'l2'], iter_valid1_l1, True) fit_and_check(["valid_0"], ["l1", "l2"], iter_valid1_l1, True)
params_fit['eval_metric'] = ["l1", "l2"] params_fit["eval_metric"] = ["l1", "l2"]
fit_and_check(['valid_0'], ['l1', 'l2'], iter_min_valid1, False) fit_and_check(["valid_0"], ["l1", "l2"], iter_min_valid1, False)
fit_and_check(['valid_0'], ['l1', 'l2'], iter_valid1_l1, True) fit_and_check(["valid_0"], ["l1", "l2"], iter_valid1_l1, True)
params_fit['eval_metric'] = ["l2", "l1"] params_fit["eval_metric"] = ["l2", "l1"]
fit_and_check(['valid_0'], ['l1', 'l2'], iter_min_valid1, False) fit_and_check(["valid_0"], ["l1", "l2"], iter_min_valid1, False)
fit_and_check(['valid_0'], ['l1', 'l2'], iter_valid1_l2, True) fit_and_check(["valid_0"], ["l1", "l2"], iter_valid1_l2, True)
params_fit['eval_metric'] = ["l2", "regression", "mse"] # test aliases params_fit["eval_metric"] = ["l2", "regression", "mse"] # test aliases
fit_and_check(['valid_0'], ['l2'], iter_valid1_l2, False) fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, False)
fit_and_check(['valid_0'], ['l2'], iter_valid1_l2, True) fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, True)
# two eval_set # two eval_set
params_fit['eval_set'] = [(X_test1, y_test1), (X_test2, y_test2)] params_fit["eval_set"] = [(X_test1, y_test1), (X_test2, y_test2)]
params_fit['eval_metric'] = ["l1", "l2"] params_fit["eval_metric"] = ["l1", "l2"]
fit_and_check(['valid_0', 'valid_1'], ['l1', 'l2'], iter_min_l1, True) fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min_l1, True)
params_fit['eval_metric'] = ["l2", "l1"] params_fit["eval_metric"] = ["l2", "l1"]
fit_and_check(['valid_0', 'valid_1'], ['l1', 'l2'], iter_min_l2, True) fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min_l2, True)
params_fit['eval_set'] = [(X_test2, y_test2), (X_test1, y_test1)] params_fit["eval_set"] = [(X_test2, y_test2), (X_test1, y_test1)]
params_fit['eval_metric'] = ["l1", "l2"] params_fit["eval_metric"] = ["l1", "l2"]
fit_and_check(['valid_0', 'valid_1'], ['l1', 'l2'], iter_min, False) fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min, False)
fit_and_check(['valid_0', 'valid_1'], ['l1', 'l2'], iter_min_l1, True) fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min_l1, True)
params_fit['eval_metric'] = ["l2", "l1"] params_fit["eval_metric"] = ["l2", "l1"]
fit_and_check(['valid_0', 'valid_1'], ['l1', 'l2'], iter_min, False) fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min, False)
fit_and_check(['valid_0', 'valid_1'], ['l1', 'l2'], iter_min_l2, True) fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min_l2, True)
def test_class_weight():
    """Check that ``class_weight`` / ``eval_class_weight`` actually affect evaluation results.

    Strategy: fit two classifiers on digits (one with integer labels, one with the same
    labels as strings) using per-eval-set class weights, then assert that

    * different eval sets (which received different class weights) produce different
      metric curves (``assert_raises(AssertionError, assert_allclose, ...)``), and
    * integer-label and string-label training produce identical metric curves, i.e.
      label dtype does not change the weighting behavior.
    """
    X, y = load_digits(n_class=10, return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Same labels, but as strings, to verify dtype-independence of class weighting.
    y_train_str = y_train.astype("str")
    y_test_str = y_test.astype("str")
    gbm = lgb.LGBMClassifier(n_estimators=10, class_weight="balanced", verbose=-1)
    gbm.fit(
        X_train,
        y_train,
        # Five eval sets: the test set is repeated so each copy can get a different weight.
        eval_set=[(X_train, y_train), (X_test, y_test), (X_test, y_test), (X_test, y_test), (X_test, y_test)],
        eval_class_weight=["balanced", None, "balanced", {1: 10, 4: 20}, {5: 30, 2: 40}],
    )
    # Every pair of eval sets got different weights, so their metric curves must differ.
    for eval_set1, eval_set2 in itertools.combinations(gbm.evals_result_.keys(), 2):
        for metric in gbm.evals_result_[eval_set1]:
            np.testing.assert_raises(
                AssertionError,
                np.testing.assert_allclose,
                gbm.evals_result_[eval_set1][metric],
                gbm.evals_result_[eval_set2][metric],
            )
    gbm_str = lgb.LGBMClassifier(n_estimators=10, class_weight="balanced", verbose=-1)
    gbm_str.fit(
        X_train,
        y_train_str,
        eval_set=[
            (X_train, y_train_str),
            (X_test, y_test_str),
            (X_test, y_test_str),
            (X_test, y_test_str),
            (X_test, y_test_str),
        ],
        # Same weights as above, keyed by the string form of each class label.
        eval_class_weight=["balanced", None, "balanced", {"1": 10, "4": 20}, {"5": 30, "2": 40}],
    )
    for eval_set1, eval_set2 in itertools.combinations(gbm_str.evals_result_.keys(), 2):
        for metric in gbm_str.evals_result_[eval_set1]:
            np.testing.assert_raises(
                AssertionError,
                np.testing.assert_allclose,
                gbm_str.evals_result_[eval_set1][metric],
                gbm_str.evals_result_[eval_set2][metric],
            )
    # Integer-label and string-label runs received identical weights per eval set,
    # so their metric curves must match exactly (set-by-set, metric-by-metric).
    for eval_set in gbm.evals_result_:
        for metric in gbm.evals_result_[eval_set]:
            np.testing.assert_allclose(gbm.evals_result_[eval_set][metric], gbm_str.evals_result_[eval_set][metric])
def test_continue_training_with_model():
    """Check that ``init_model`` continues training from an existing fitted classifier.

    Trains a 5-tree model, then trains another 5-tree model starting from it via
    ``init_model`` and asserts:

    * the continued run records the same number of eval iterations (5) as the first,
    * the continued model's final validation multi_logloss is strictly lower than the
      initial model's, i.e. the extra trees actually improved on the warm start.
    """
    X, y = load_digits(n_class=3, return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    init_gbm = lgb.LGBMClassifier(n_estimators=5).fit(X_train, y_train, eval_set=(X_test, y_test))
    # Warm-start from init_gbm; evals_result_ of this run covers only the 5 new iterations.
    gbm = lgb.LGBMClassifier(n_estimators=5).fit(X_train, y_train, eval_set=(X_test, y_test), init_model=init_gbm)
    assert len(init_gbm.evals_result_["valid_0"]["multi_logloss"]) == len(gbm.evals_result_["valid_0"]["multi_logloss"])
    assert len(init_gbm.evals_result_["valid_0"]["multi_logloss"]) == 5
    # Continued training must improve on the warm-start model's final loss.
    assert gbm.evals_result_["valid_0"]["multi_logloss"][-1] < init_gbm.evals_result_["valid_0"]["multi_logloss"][-1]
def test_actual_number_of_trees(): def test_actual_number_of_trees():
...@@ -1288,20 +1282,16 @@ def test_sklearn_integration(estimator, check): ...@@ -1288,20 +1282,16 @@ def test_sklearn_integration(estimator, check):
check(estimator) check(estimator)
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification', 'ranking', 'regression']) @pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "ranking", "regression"])
def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task): def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task):
pd = pytest.importorskip("pandas") pd = pytest.importorskip("pandas")
X, y, g = _create_data(task) X, y, g = _create_data(task)
X = pd.DataFrame(X) X = pd.DataFrame(X)
y_col_array = y.reshape(-1, 1) y_col_array = y.reshape(-1, 1)
params = { params = {"n_estimators": 1, "num_leaves": 3, "random_state": 0}
'n_estimators': 1,
'num_leaves': 3,
'random_state': 0
}
model_factory = task_to_model_factory[task] model_factory = task_to_model_factory[task]
with pytest.warns(UserWarning, match='column-vector'): with pytest.warns(UserWarning, match="column-vector"):
if task == 'ranking': if task == "ranking":
model_1d = model_factory(**params).fit(X, y, group=g) model_1d = model_factory(**params).fit(X, y, group=g)
model_2d = model_factory(**params).fit(X, y_col_array, group=g) model_2d = model_factory(**params).fit(X, y_col_array, group=g)
else: else:
...@@ -1313,12 +1303,12 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task ...@@ -1313,12 +1303,12 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task
np.testing.assert_array_equal(preds_1d, preds_2d) np.testing.assert_array_equal(preds_1d, preds_2d)
@pytest.mark.parametrize('use_weight', [True, False]) @pytest.mark.parametrize("use_weight", [True, False])
def test_multiclass_custom_objective(use_weight): def test_multiclass_custom_objective(use_weight):
centers = [[-4, -4], [4, 4], [-4, 4]] centers = [[-4, -4], [4, 4], [-4, 4]]
X, y = make_blobs(n_samples=1_000, centers=centers, random_state=42) X, y = make_blobs(n_samples=1_000, centers=centers, random_state=42)
weight = np.full_like(y, 2) if use_weight else None weight = np.full_like(y, 2) if use_weight else None
params = {'n_estimators': 10, 'num_leaves': 7} params = {"n_estimators": 10, "num_leaves": 7}
builtin_obj_model = lgb.LGBMClassifier(**params) builtin_obj_model = lgb.LGBMClassifier(**params)
builtin_obj_model.fit(X, y, sample_weight=weight) builtin_obj_model.fit(X, y, sample_weight=weight)
builtin_obj_preds = builtin_obj_model.predict_proba(X) builtin_obj_preds = builtin_obj_model.predict_proba(X)
...@@ -1332,11 +1322,11 @@ def test_multiclass_custom_objective(use_weight): ...@@ -1332,11 +1322,11 @@ def test_multiclass_custom_objective(use_weight):
assert callable(custom_obj_model.objective_) assert callable(custom_obj_model.objective_)
@pytest.mark.parametrize('use_weight', [True, False]) @pytest.mark.parametrize("use_weight", [True, False])
def test_multiclass_custom_eval(use_weight): def test_multiclass_custom_eval(use_weight):
def custom_eval(y_true, y_pred, weight): def custom_eval(y_true, y_pred, weight):
loss = log_loss(y_true, y_pred, sample_weight=weight) loss = log_loss(y_true, y_pred, sample_weight=weight)
return 'custom_logloss', loss, False return "custom_logloss", loss, False
centers = [[-4, -4], [4, 4], [-4, 4]] centers = [[-4, -4], [4, 4], [-4, 4]]
X, y = make_blobs(n_samples=1_000, centers=centers, random_state=42) X, y = make_blobs(n_samples=1_000, centers=centers, random_state=42)
...@@ -1348,27 +1338,25 @@ def test_multiclass_custom_eval(use_weight): ...@@ -1348,27 +1338,25 @@ def test_multiclass_custom_eval(use_weight):
else: else:
weight_train = None weight_train = None
weight_valid = None weight_valid = None
params = {'objective': 'multiclass', 'num_class': 3, 'num_leaves': 7} params = {"objective": "multiclass", "num_class": 3, "num_leaves": 7}
model = lgb.LGBMClassifier(**params) model = lgb.LGBMClassifier(**params)
model.fit( model.fit(
X_train, X_train,
y_train, y_train,
sample_weight=weight_train, sample_weight=weight_train,
eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_set=[(X_train, y_train), (X_valid, y_valid)],
eval_names=['train', 'valid'], eval_names=["train", "valid"],
eval_sample_weight=[weight_train, weight_valid], eval_sample_weight=[weight_train, weight_valid],
eval_metric=custom_eval, eval_metric=custom_eval,
) )
eval_result = model.evals_result_ eval_result = model.evals_result_
train_ds = (X_train, y_train, weight_train) train_ds = (X_train, y_train, weight_train)
valid_ds = (X_valid, y_valid, weight_valid) valid_ds = (X_valid, y_valid, weight_valid)
for key, (X, y_true, weight) in zip(['train', 'valid'], [train_ds, valid_ds]): for key, (X, y_true, weight) in zip(["train", "valid"], [train_ds, valid_ds]):
np.testing.assert_allclose( np.testing.assert_allclose(eval_result[key]["multi_logloss"], eval_result[key]["custom_logloss"])
eval_result[key]['multi_logloss'], eval_result[key]['custom_logloss']
)
y_pred = model.predict_proba(X) y_pred = model.predict_proba(X)
_, metric_value, _ = custom_eval(y_true, y_pred, weight) _, metric_value, _ = custom_eval(y_true, y_pred, weight)
np.testing.assert_allclose(metric_value, eval_result[key]['custom_logloss'][-1]) np.testing.assert_allclose(metric_value, eval_result[key]["custom_logloss"][-1])
def test_negative_n_jobs(tmp_path): def test_negative_n_jobs(tmp_path):
...@@ -1397,21 +1385,21 @@ def test_default_n_jobs(tmp_path): ...@@ -1397,21 +1385,21 @@ def test_default_n_jobs(tmp_path):
assert bool(re.search(rf"\[num_threads: {n_cores}\]", model_txt)) assert bool(re.search(rf"\[num_threads: {n_cores}\]", model_txt))
@pytest.mark.skipif(not PANDAS_INSTALLED, reason='pandas is not installed') @pytest.mark.skipif(not PANDAS_INSTALLED, reason="pandas is not installed")
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification', 'ranking', 'regression']) @pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "ranking", "regression"])
def test_validate_features(task): def test_validate_features(task):
X, y, g = _create_data(task, n_features=4) X, y, g = _create_data(task, n_features=4)
features = ['x1', 'x2', 'x3', 'x4'] features = ["x1", "x2", "x3", "x4"]
df = pd_DataFrame(X, columns=features) df = pd_DataFrame(X, columns=features)
model = task_to_model_factory[task](n_estimators=10, num_leaves=15, verbose=-1) model = task_to_model_factory[task](n_estimators=10, num_leaves=15, verbose=-1)
if task == 'ranking': if task == "ranking":
model.fit(df, y, group=g) model.fit(df, y, group=g)
else: else:
model.fit(df, y) model.fit(df, y)
assert model.feature_name_ == features assert model.feature_name_ == features
# try to predict with a different feature # try to predict with a different feature
df2 = df.rename(columns={'x2': 'z'}) df2 = df.rename(columns={"x2": "z"})
with pytest.raises(lgb.basic.LightGBMError, match="Expected 'x2' at position 1 but found 'z'"): with pytest.raises(lgb.basic.LightGBMError, match="Expected 'x2' at position 1 but found 'z'"):
model.predict(df2, validate_features=True) model.predict(df2, validate_features=True)
...@@ -1419,59 +1407,59 @@ def test_validate_features(task): ...@@ -1419,59 +1407,59 @@ def test_validate_features(task):
model.predict(df2, validate_features=False) model.predict(df2, validate_features=False)
@pytest.mark.parametrize('X_type', ['dt_DataTable', 'list2d', 'numpy', 'scipy_csc', 'scipy_csr', 'pd_DataFrame']) @pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
@pytest.mark.parametrize('y_type', ['list1d', 'numpy', 'pd_Series', 'pd_DataFrame']) @pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_Series", "pd_DataFrame"])
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification', 'regression']) @pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "regression"])
def test_classification_and_regression_minimally_work_with_all_all_accepted_data_types(X_type, y_type, task): def test_classification_and_regression_minimally_work_with_all_all_accepted_data_types(X_type, y_type, task):
if any(t.startswith("pd_") for t in [X_type, y_type]) and not PANDAS_INSTALLED: if any(t.startswith("pd_") for t in [X_type, y_type]) and not PANDAS_INSTALLED:
pytest.skip('pandas is not installed') pytest.skip("pandas is not installed")
if any(t.startswith("dt_") for t in [X_type, y_type]) and not DATATABLE_INSTALLED: if any(t.startswith("dt_") for t in [X_type, y_type]) and not DATATABLE_INSTALLED:
pytest.skip('datatable is not installed') pytest.skip("datatable is not installed")
X, y, g = _create_data(task, n_samples=2_000) X, y, g = _create_data(task, n_samples=2_000)
weights = np.abs(np.random.randn(y.shape[0])) weights = np.abs(np.random.randn(y.shape[0]))
if task == 'binary-classification' or task == 'regression': if task == "binary-classification" or task == "regression":
init_score = np.full_like(y, np.mean(y)) init_score = np.full_like(y, np.mean(y))
elif task == 'multiclass-classification': elif task == "multiclass-classification":
init_score = np.outer(y, np.array([0.1, 0.2, 0.7])) init_score = np.outer(y, np.array([0.1, 0.2, 0.7]))
else: else:
raise ValueError(f"Unrecognized task '{task}'") raise ValueError(f"Unrecognized task '{task}'")
X_valid = X * 2 X_valid = X * 2
if X_type == 'dt_DataTable': if X_type == "dt_DataTable":
X = dt_DataTable(X) X = dt_DataTable(X)
elif X_type == 'list2d': elif X_type == "list2d":
X = X.tolist() X = X.tolist()
elif X_type == 'scipy_csc': elif X_type == "scipy_csc":
X = scipy.sparse.csc_matrix(X) X = scipy.sparse.csc_matrix(X)
elif X_type == 'scipy_csr': elif X_type == "scipy_csr":
X = scipy.sparse.csr_matrix(X) X = scipy.sparse.csr_matrix(X)
elif X_type == 'pd_DataFrame': elif X_type == "pd_DataFrame":
X = pd_DataFrame(X) X = pd_DataFrame(X)
elif X_type != 'numpy': elif X_type != "numpy":
raise ValueError(f"Unrecognized X_type: '{X_type}'") raise ValueError(f"Unrecognized X_type: '{X_type}'")
# make weights and init_score same types as y, just to avoid # make weights and init_score same types as y, just to avoid
# a huge number of combinations and therefore test cases # a huge number of combinations and therefore test cases
if y_type == 'list1d': if y_type == "list1d":
y = y.tolist() y = y.tolist()
weights = weights.tolist() weights = weights.tolist()
init_score = init_score.tolist() init_score = init_score.tolist()
elif y_type == 'pd_DataFrame': elif y_type == "pd_DataFrame":
y = pd_DataFrame(y) y = pd_DataFrame(y)
weights = pd_Series(weights) weights = pd_Series(weights)
if task == 'multiclass-classification': if task == "multiclass-classification":
init_score = pd_DataFrame(init_score) init_score = pd_DataFrame(init_score)
else: else:
init_score = pd_Series(init_score) init_score = pd_Series(init_score)
elif y_type == 'pd_Series': elif y_type == "pd_Series":
y = pd_Series(y) y = pd_Series(y)
weights = pd_Series(weights) weights = pd_Series(weights)
if task == 'multiclass-classification': if task == "multiclass-classification":
init_score = pd_DataFrame(init_score) init_score = pd_DataFrame(init_score)
else: else:
init_score = pd_Series(init_score) init_score = pd_Series(init_score)
elif y_type != 'numpy': elif y_type != "numpy":
raise ValueError(f"Unrecognized y_type: '{y_type}'") raise ValueError(f"Unrecognized y_type: '{y_type}'")
model = task_to_model_factory[task](n_estimators=10, verbose=-1) model = task_to_model_factory[task](n_estimators=10, verbose=-1)
...@@ -1482,73 +1470,73 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data ...@@ -1482,73 +1470,73 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
init_score=init_score, init_score=init_score,
eval_set=[(X_valid, y)], eval_set=[(X_valid, y)],
eval_sample_weight=[weights], eval_sample_weight=[weights],
eval_init_score=[init_score] eval_init_score=[init_score],
) )
preds = model.predict(X) preds = model.predict(X)
if task == 'binary-classification': if task == "binary-classification":
assert accuracy_score(y, preds) >= 0.99 assert accuracy_score(y, preds) >= 0.99
elif task == 'multiclass-classification': elif task == "multiclass-classification":
assert accuracy_score(y, preds) >= 0.99 assert accuracy_score(y, preds) >= 0.99
elif task == 'regression': elif task == "regression":
assert r2_score(y, preds) > 0.86 assert r2_score(y, preds) > 0.86
else: else:
raise ValueError(f"Unrecognized task: '{task}'") raise ValueError(f"Unrecognized task: '{task}'")
@pytest.mark.parametrize('X_type', ['dt_DataTable', 'list2d', 'numpy', 'scipy_csc', 'scipy_csr', 'pd_DataFrame']) @pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
@pytest.mark.parametrize('y_type', ['list1d', 'numpy', 'pd_DataFrame', 'pd_Series']) @pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_DataFrame", "pd_Series"])
@pytest.mark.parametrize('g_type', ['list1d_float', 'list1d_int', 'numpy', 'pd_Series']) @pytest.mark.parametrize("g_type", ["list1d_float", "list1d_int", "numpy", "pd_Series"])
def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type, g_type): def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type, g_type):
if any(t.startswith("pd_") for t in [X_type, y_type, g_type]) and not PANDAS_INSTALLED: if any(t.startswith("pd_") for t in [X_type, y_type, g_type]) and not PANDAS_INSTALLED:
pytest.skip('pandas is not installed') pytest.skip("pandas is not installed")
if any(t.startswith("dt_") for t in [X_type, y_type, g_type]) and not DATATABLE_INSTALLED: if any(t.startswith("dt_") for t in [X_type, y_type, g_type]) and not DATATABLE_INSTALLED:
pytest.skip('datatable is not installed') pytest.skip("datatable is not installed")
X, y, g = _create_data(task='ranking', n_samples=1_000) X, y, g = _create_data(task="ranking", n_samples=1_000)
weights = np.abs(np.random.randn(y.shape[0])) weights = np.abs(np.random.randn(y.shape[0]))
init_score = np.full_like(y, np.mean(y)) init_score = np.full_like(y, np.mean(y))
X_valid = X * 2 X_valid = X * 2
if X_type == 'dt_DataTable': if X_type == "dt_DataTable":
X = dt_DataTable(X) X = dt_DataTable(X)
elif X_type == 'list2d': elif X_type == "list2d":
X = X.tolist() X = X.tolist()
elif X_type == 'scipy_csc': elif X_type == "scipy_csc":
X = scipy.sparse.csc_matrix(X) X = scipy.sparse.csc_matrix(X)
elif X_type == 'scipy_csr': elif X_type == "scipy_csr":
X = scipy.sparse.csr_matrix(X) X = scipy.sparse.csr_matrix(X)
elif X_type == 'pd_DataFrame': elif X_type == "pd_DataFrame":
X = pd_DataFrame(X) X = pd_DataFrame(X)
elif X_type != 'numpy': elif X_type != "numpy":
raise ValueError(f"Unrecognized X_type: '{X_type}'") raise ValueError(f"Unrecognized X_type: '{X_type}'")
# make weights and init_score same types as y, just to avoid # make weights and init_score same types as y, just to avoid
# a huge number of combinations and therefore test cases # a huge number of combinations and therefore test cases
if y_type == 'list1d': if y_type == "list1d":
y = y.tolist() y = y.tolist()
weights = weights.tolist() weights = weights.tolist()
init_score = init_score.tolist() init_score = init_score.tolist()
elif y_type == 'pd_DataFrame': elif y_type == "pd_DataFrame":
y = pd_DataFrame(y) y = pd_DataFrame(y)
weights = pd_Series(weights) weights = pd_Series(weights)
init_score = pd_Series(init_score) init_score = pd_Series(init_score)
elif y_type == 'pd_Series': elif y_type == "pd_Series":
y = pd_Series(y) y = pd_Series(y)
weights = pd_Series(weights) weights = pd_Series(weights)
init_score = pd_Series(init_score) init_score = pd_Series(init_score)
elif y_type != 'numpy': elif y_type != "numpy":
raise ValueError(f"Unrecognized y_type: '{y_type}'") raise ValueError(f"Unrecognized y_type: '{y_type}'")
if g_type == 'list1d_float': if g_type == "list1d_float":
g = g.astype("float").tolist() g = g.astype("float").tolist()
elif g_type == 'list1d_int': elif g_type == "list1d_int":
g = g.astype("int").tolist() g = g.astype("int").tolist()
elif g_type == 'pd_Series': elif g_type == "pd_Series":
g = pd_Series(g) g = pd_Series(g)
elif g_type != 'numpy': elif g_type != "numpy":
raise ValueError(f"Unrecognized g_type: '{g_type}'") raise ValueError(f"Unrecognized g_type: '{g_type}'")
model = task_to_model_factory['ranking'](n_estimators=10, verbose=-1) model = task_to_model_factory["ranking"](n_estimators=10, verbose=-1)
model.fit( model.fit(
X=X, X=X,
y=y, y=y,
...@@ -1558,7 +1546,7 @@ def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type ...@@ -1558,7 +1546,7 @@ def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type
eval_set=[(X_valid, y)], eval_set=[(X_valid, y)],
eval_sample_weight=[weights], eval_sample_weight=[weights],
eval_init_score=[init_score], eval_init_score=[init_score],
eval_group=[g] eval_group=[g],
) )
preds = model.predict(X) preds = model.predict(X)
assert spearmanr(preds, y).correlation >= 0.99 assert spearmanr(preds, y).correlation >= 0.99
...@@ -1570,7 +1558,7 @@ def test_classifier_fit_detects_classes_every_time(): ...@@ -1570,7 +1558,7 @@ def test_classifier_fit_detects_classes_every_time():
ncols = 20 ncols = 20
X = rng.standard_normal(size=(nrows, ncols)) X = rng.standard_normal(size=(nrows, ncols))
y_bin = (rng.random(size=nrows) <= .3).astype(np.float64) y_bin = (rng.random(size=nrows) <= 0.3).astype(np.float64)
y_multi = rng.integers(4, size=nrows) y_multi = rng.integers(4, size=nrows)
model = lgb.LGBMClassifier(verbose=-1) model = lgb.LGBMClassifier(verbose=-1)
......
...@@ -10,7 +10,7 @@ import lightgbm as lgb ...@@ -10,7 +10,7 @@ import lightgbm as lgb
def test_register_logger(tmp_path): def test_register_logger(tmp_path):
logger = logging.getLogger("LightGBM") logger = logging.getLogger("LightGBM")
logger.setLevel(logging.DEBUG) logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(levelname)s | %(message)s') formatter = logging.Formatter("%(levelname)s | %(message)s")
log_filename = tmp_path / "LightGBM_test_logger.log" log_filename = tmp_path / "LightGBM_test_logger.log"
file_handler = logging.FileHandler(log_filename, mode="w", encoding="utf-8") file_handler = logging.FileHandler(log_filename, mode="w", encoding="utf-8")
file_handler.setLevel(logging.DEBUG) file_handler.setLevel(logging.DEBUG)
...@@ -18,29 +18,27 @@ def test_register_logger(tmp_path): ...@@ -18,29 +18,27 @@ def test_register_logger(tmp_path):
logger.addHandler(file_handler) logger.addHandler(file_handler)
def dummy_metric(_, __): def dummy_metric(_, __):
logger.debug('In dummy_metric') logger.debug("In dummy_metric")
return 'dummy_metric', 1, True return "dummy_metric", 1, True
lgb.register_logger(logger) lgb.register_logger(logger)
X = np.array([[1, 2, 3], X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]], dtype=np.float32)
[1, 2, 4],
[1, 2, 4],
[1, 2, 3]],
dtype=np.float32)
y = np.array([0, 1, 1, 0]) y = np.array([0, 1, 1, 0])
lgb_train = lgb.Dataset(X, y) lgb_train = lgb.Dataset(X, y)
lgb_valid = lgb.Dataset(X, y) # different object for early-stopping lgb_valid = lgb.Dataset(X, y) # different object for early-stopping
eval_records = {} eval_records = {}
callbacks = [ callbacks = [lgb.record_evaluation(eval_records), lgb.log_evaluation(2), lgb.early_stopping(10)]
lgb.record_evaluation(eval_records), lgb.train(
lgb.log_evaluation(2), {"objective": "binary", "metric": ["auc", "binary_error"]},
lgb.early_stopping(10) lgb_train,
] num_boost_round=10,
lgb.train({'objective': 'binary', 'metric': ['auc', 'binary_error']}, feval=dummy_metric,
lgb_train, num_boost_round=10, feval=dummy_metric, valid_sets=[lgb_valid],
valid_sets=[lgb_valid], categorical_feature=[1], callbacks=callbacks) categorical_feature=[1],
callbacks=callbacks,
)
lgb.plot_metric(eval_records) lgb.plot_metric(eval_records)
...@@ -89,7 +87,7 @@ WARNING | More than one metric available, picking one to plot. ...@@ -89,7 +87,7 @@ WARNING | More than one metric available, picking one to plot.
"INFO | [LightGBM] [Warning] GPU acceleration is disabled because no non-trivial dense features can be found", "INFO | [LightGBM] [Warning] GPU acceleration is disabled because no non-trivial dense features can be found",
"INFO | [LightGBM] [Warning] Using sparse features with CUDA is currently not supported.", "INFO | [LightGBM] [Warning] Using sparse features with CUDA is currently not supported.",
"INFO | [LightGBM] [Warning] CUDA currently requires double precision calculations.", "INFO | [LightGBM] [Warning] CUDA currently requires double precision calculations.",
"INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!" "INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!",
] ]
cuda_lines = [ cuda_lines = [
"INFO | [LightGBM] [Warning] Metric auc is not implemented in cuda version. Fall back to evaluation on CPU.", "INFO | [LightGBM] [Warning] Metric auc is not implemented in cuda version. Fall back to evaluation on CPU.",
...@@ -142,11 +140,7 @@ def test_register_custom_logger(): ...@@ -142,11 +140,7 @@ def test_register_custom_logger():
logged_messages.append(msg) logged_messages.append(msg)
custom_logger = CustomLogger() custom_logger = CustomLogger()
lgb.register_logger( lgb.register_logger(custom_logger, info_method_name="custom_info", warning_method_name="custom_warning")
custom_logger,
info_method_name="custom_info",
warning_method_name="custom_warning"
)
lgb.basic._log_info("info message") lgb.basic._log_info("info message")
lgb.basic._log_warning("warning message") lgb.basic._log_warning("warning message")
...@@ -155,18 +149,14 @@ def test_register_custom_logger(): ...@@ -155,18 +149,14 @@ def test_register_custom_logger():
assert logged_messages == expected_log assert logged_messages == expected_log
logged_messages = [] logged_messages = []
X = np.array([[1, 2, 3], X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]], dtype=np.float32)
[1, 2, 4],
[1, 2, 4],
[1, 2, 3]],
dtype=np.float32)
y = np.array([0, 1, 1, 0]) y = np.array([0, 1, 1, 0])
lgb_data = lgb.Dataset(X, y) lgb_data = lgb.Dataset(X, y)
lgb.train( lgb.train(
{'objective': 'binary', 'metric': 'auc'}, {"objective": "binary", "metric": "auc"},
lgb_data, lgb_data,
num_boost_round=10, num_boost_round=10,
valid_sets=[lgb_data], valid_sets=[lgb_data],
categorical_feature=[1] categorical_feature=[1],
) )
assert logged_messages, "custom logger was not called" assert logged_messages, "custom logger was not called"
...@@ -34,8 +34,9 @@ def load_linnerud(**kwargs): ...@@ -34,8 +34,9 @@ def load_linnerud(**kwargs):
return sklearn.datasets.load_linnerud(**kwargs) return sklearn.datasets.load_linnerud(**kwargs)
def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2, def make_ranking(
group=None, random_gs=False, avg_gs=10, random_state=0): n_samples=100, n_features=20, n_informative=5, gmax=2, group=None, random_gs=False, avg_gs=10, random_state=0
):
"""Generate a learning-to-rank dataset - feature vectors grouped together with """Generate a learning-to-rank dataset - feature vectors grouped together with
integer-valued graded relevance scores. Replace this with a sklearn.datasets function integer-valued graded relevance scores. Replace this with a sklearn.datasets function
if ranking objective becomes supported in sklearn.datasets module. if ranking objective becomes supported in sklearn.datasets module.
...@@ -81,7 +82,7 @@ def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2, ...@@ -81,7 +82,7 @@ def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
relvalues = range(gmax + 1) relvalues = range(gmax + 1)
# build y/target and group-id vectors with user-specified group sizes. # build y/target and group-id vectors with user-specified group sizes.
if group is not None and hasattr(group, '__len__'): if group is not None and hasattr(group, "__len__"):
n_samples = np.sum(group) n_samples = np.sum(group)
for i, gsize in enumerate(group): for i, gsize in enumerate(group):
...@@ -116,8 +117,9 @@ def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2, ...@@ -116,8 +117,9 @@ def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
@lru_cache(maxsize=None)
def make_synthetic_regression(n_samples=100, n_features=4, n_informative=2, random_state=42):
    """Return a small cached regression dataset ``(X, y)`` from ``sklearn.datasets.make_regression``.

    Cached with ``lru_cache`` so repeated test invocations with the same
    parameters reuse one dataset instead of regenerating it.
    """
    return sklearn.datasets.make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        random_state=random_state,
    )
def dummy_obj(preds, train_data): def dummy_obj(preds, train_data):
...@@ -126,7 +128,7 @@ def dummy_obj(preds, train_data): ...@@ -126,7 +128,7 @@ def dummy_obj(preds, train_data):
def mse_obj(y_pred, dtrain):
    """Custom mean-squared-error objective for LightGBM.

    Returns the per-sample gradient (the residual ``y_pred - y_true``) and
    hessian (constant 1.0 for every sample).
    """
    residual = y_pred - dtrain.get_label()
    hessian = np.ones(len(residual))
    return residual, hessian
...@@ -157,50 +159,41 @@ def sklearn_multiclass_custom_objective(y_true, y_pred, weight=None): ...@@ -157,50 +159,41 @@ def sklearn_multiclass_custom_objective(y_true, y_pred, weight=None):
def pickle_obj(obj, filepath, serializer):
    """Serialize ``obj`` to ``filepath`` with one of the supported backends.

    ``serializer`` must be ``"pickle"``, ``"joblib"``, or ``"cloudpickle"``;
    any other value raises ``ValueError``.
    """
    if serializer == "pickle":
        with open(filepath, "wb") as f:
            pickle.dump(obj, f)
        return
    if serializer == "joblib":
        joblib.dump(obj, filepath)
        return
    if serializer == "cloudpickle":
        with open(filepath, "wb") as f:
            cloudpickle.dump(obj, f)
        return
    raise ValueError(f"Unrecognized serializer type: {serializer}")
def unpickle_obj(filepath, serializer):
    """Load and return an object from ``filepath`` with the matching backend.

    ``serializer`` must be ``"pickle"``, ``"joblib"``, or ``"cloudpickle"``;
    any other value raises ``ValueError``.
    """
    if serializer == "pickle":
        with open(filepath, "rb") as f:
            return pickle.load(f)
    if serializer == "joblib":
        return joblib.load(filepath)
    if serializer == "cloudpickle":
        with open(filepath, "rb") as f:
            return cloudpickle.load(f)
    raise ValueError(f"Unrecognized serializer type: {serializer}")
def pickle_and_unpickle_object(obj, serializer):
    """Round-trip ``obj`` through a temporary file with ``serializer`` and return the reloaded copy."""
    with lgb.basic._TempFile() as tmp_file:
        pickle_obj(obj=obj, filepath=tmp_file.name, serializer=serializer)
        restored = unpickle_obj(filepath=tmp_file.name, serializer=serializer)
    return restored
# Probe once at import time (rather than once per assertion) whether this
# NumPy version's assert_array_equal accepts the keyword-only ``strict`` argument.
_numpy_testing_supports_strict_kwarg = (
    "strict" in getfullargspec(np.testing.assert_array_equal).kwonlyargs
)
def np_assert_array_equal(*args, **kwargs): def np_assert_array_equal(*args, **kwargs):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment