Unverified Commit 1b792e71 authored by James Lamb, committed by GitHub

[ci] [python-package] enable ruff-format on tests and examples (#6317)

parent b60068c8
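
Most of the changes below are mechanical: ruff-format normalizes string literals to double quotes, reflows long call sites and literals onto multiple lines (keeping them exploded when a trailing comma is present), and adds spaces around ":" when slice bounds are expressions. In CI, a formatter like this is typically enforced with "ruff format --check" over the affected directories; the exact workflow and pyproject.toml settings used by this PR are not shown in this diff. A condensed, runnable sketch of the resulting style (synthetic code, not lifted from the test suite):

    # Illustrative only: names and values below are made up, not taken from LightGBM's tests.
    def split_into_chunks(values, chunk_size):
        n_chunks = (len(values) + chunk_size - 1) // chunk_size
        # ruff-format puts spaces around ":" when slice bounds are expressions
        return [values[i * chunk_size : (i + 1) * chunk_size] for i in range(n_chunks)]

    params = {
        "max_bin": 255,  # double quotes everywhere
        "gpu_use_dp": True,  # magic trailing comma keeps one entry per line
    }

    print(split_into_chunks(list(range(10)), chunk_size=3), params["max_bin"])
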
@@ -19,8 +19,9 @@ from .utils import dummy_obj, load_breast_cancer, mse_obj, np_assert_array_equal
def test_basic(tmp_path):
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True),
test_size=0.1, random_state=2)
X_train, X_test, y_train, y_test = train_test_split(
*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
)
feature_names = [f"Column_{i}" for i in range(X_train.shape[1])]
feature_names[1] = "a" * 1000 # set one name to a value longer than default buffer size
train_data = lgb.Dataset(X_train, label=y_train, feature_name=feature_names)
@@ -34,7 +35,7 @@ def test_basic(tmp_path):
"verbose": -1,
"num_threads": 1,
"max_bin": 255,
"gpu_use_dp": True
"gpu_use_dp": True,
}
bst = lgb.Booster(params, train_data)
bst.add_valid(valid_data, "valid_1")
@@ -49,7 +50,7 @@ def test_basic(tmp_path):
assert bst.current_iteration() == 20
assert bst.num_trees() == 20
assert bst.num_model_per_iteration() == 1
if getenv('TASK', '') != 'cuda':
if getenv("TASK", "") != "cuda":
assert bst.lower_bound() == pytest.approx(-2.9040190126976606)
assert bst.upper_bound() == pytest.approx(3.3182142872462883)
@@ -79,20 +80,19 @@ def test_basic(tmp_path):
# test that shape is checked during prediction
bad_X_test = X_test[:, 1:]
bad_shape_error_msg = "The number of features in data*"
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
bst.predict, bad_X_test)
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
bst.predict, sparse.csr_matrix(bad_X_test))
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
bst.predict, sparse.csc_matrix(bad_X_test))
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, bad_X_test)
np.testing.assert_raises_regex(
lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, sparse.csr_matrix(bad_X_test)
)
np.testing.assert_raises_regex(
lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, sparse.csc_matrix(bad_X_test)
)
with open(tname, "w+b") as f:
dump_svmlight_file(bad_X_test, y_test, f)
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
bst.predict, tname)
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, tname)
with open(tname, "w+b") as f:
dump_svmlight_file(X_test, y_test, f, zero_based=False)
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
bst.predict, tname)
np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, tname)
class NumpySequence(lgb.Sequence):
@@ -108,7 +108,7 @@ class NumpySequence(lgb.Sequence):
elif isinstance(idx, slice):
if not (idx.step is None or idx.step == 1):
raise NotImplementedError("No need to implement, caller will not set step by now")
return self.ndarray[idx.start:idx.stop]
return self.ndarray[idx.start : idx.stop]
elif isinstance(idx, list):
return self.ndarray[idx]
else:
@@ -132,12 +132,12 @@ def _create_sequence_from_ndarray(data, num_seq, batch_size):
return seqs
@pytest.mark.parametrize('sample_count', [11, 100, None])
@pytest.mark.parametrize('batch_size', [3, None])
@pytest.mark.parametrize('include_0_and_nan', [False, True])
@pytest.mark.parametrize('num_seq', [1, 3])
@pytest.mark.parametrize("sample_count", [11, 100, None])
@pytest.mark.parametrize("batch_size", [3, None])
@pytest.mark.parametrize("include_0_and_nan", [False, True])
@pytest.mark.parametrize("num_seq", [1, 3])
def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
params = {'bin_construct_sample_cnt': sample_count}
params = {"bin_construct_sample_cnt": sample_count}
nrow = 50
half_nrow = nrow // 2
@@ -159,8 +159,8 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
X = data[:, :-1]
Y = data[:, -1]
npy_bin_fname = tmpdir / 'data_from_npy.bin'
seq_bin_fname = tmpdir / 'data_from_seq.bin'
npy_bin_fname = tmpdir / "data_from_npy.bin"
seq_bin_fname = tmpdir / "data_from_seq.bin"
# Create dataset from numpy array directly.
ds = lgb.Dataset(X, label=Y, params=params)
@@ -181,9 +181,9 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
valid_X = valid_data[:, :-1]
valid_Y = valid_data[:, -1]
valid_npy_bin_fname = tmpdir / 'valid_data_from_npy.bin'
valid_seq_bin_fname = tmpdir / 'valid_data_from_seq.bin'
valid_seq2_bin_fname = tmpdir / 'valid_data_from_seq2.bin'
valid_npy_bin_fname = tmpdir / "valid_data_from_npy.bin"
valid_seq_bin_fname = tmpdir / "valid_data_from_seq.bin"
valid_seq2_bin_fname = tmpdir / "valid_data_from_seq2.bin"
valid_ds = lgb.Dataset(valid_X, label=valid_Y, params=params, reference=ds)
valid_ds.save_binary(valid_npy_bin_fname)
@@ -200,7 +200,7 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
assert filecmp.cmp(valid_npy_bin_fname, valid_seq2_bin_fname)
@pytest.mark.parametrize('num_seq', [1, 2])
@pytest.mark.parametrize("num_seq", [1, 2])
def test_sequence_get_data(num_seq):
nrow = 20
ncol = 11
@@ -218,12 +218,13 @@ def test_sequence_get_data(num_seq):
def test_chunked_dataset():
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1,
random_state=2)
X_train, X_test, y_train, y_test = train_test_split(
*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
)
chunk_size = X_train.shape[0] // 10 + 1
X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
X_train = [X_train[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
X_test = [X_test[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100})
valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100})
@@ -232,12 +233,13 @@ def test_chunked_dataset():
def test_chunked_dataset_linear():
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1,
random_state=2)
X_train, X_test, y_train, y_test = train_test_split(
*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
)
chunk_size = X_train.shape[0] // 10 + 1
X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
params = {"bin_construct_sample_cnt": 100, 'linear_tree': True}
X_train = [X_train[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
X_test = [X_test[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
params = {"bin_construct_sample_cnt": 100, "linear_tree": True}
train_data = lgb.Dataset(X_train, label=y_train, params=params)
valid_data = train_data.create_valid(X_test, label=y_test, params=params)
train_data.construct()
@@ -246,16 +248,16 @@ def test_chunked_dataset_linear():
def test_save_dataset_subset_and_load_from_file(tmp_path):
data = np.random.rand(100, 2)
params = {'max_bin': 50, 'min_data_in_bin': 10}
params = {"max_bin": 50, "min_data_in_bin": 10}
ds = lgb.Dataset(data, params=params)
ds.subset([1, 2, 3, 5, 8]).save_binary(tmp_path / 'subset.bin')
lgb.Dataset(tmp_path / 'subset.bin', params=params).construct()
ds.subset([1, 2, 3, 5, 8]).save_binary(tmp_path / "subset.bin")
lgb.Dataset(tmp_path / "subset.bin", params=params).construct()
def test_subset_group():
rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank'
X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train'))
q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query'))
rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank"
X_train, y_train = load_svmlight_file(str(rank_example_dir / "rank.train"))
q_train = np.loadtxt(str(rank_example_dir / "rank.train.query"))
lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
assert len(lgb_train.get_group()) == 201
subset = lgb_train.subset(list(range(10))).construct()
@@ -294,7 +296,7 @@ def test_add_features_throws_if_datasets_unconstructed():
def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
X = np.random.random((100, 5))
X[:, [1, 3]] = 0
names = [f'col_{i}' for i in range(5)]
names = [f"col_{i}" for i in range(5)]
for j in range(1, 5):
d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
@@ -304,9 +306,9 @@ def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
d = lgb.Dataset(X, feature_name=names).construct()
dname = tmp_path / "d.txt"
d._dump_text(dname)
with open(d1name, 'rt') as d1f:
with open(d1name, "rt") as d1f:
d1txt = d1f.read()
with open(dname, 'rt') as df:
with open(dname, "rt") as df:
dtxt = df.read()
assert dtxt == d1txt
@@ -314,7 +316,7 @@
def test_add_features_same_booster_behaviour(tmp_path):
X = np.random.random((100, 5))
X[:, [1, 3]] = 0
names = [f'col_{i}' for i in range(5)]
names = [f"col_{i}" for i in range(5)]
for j in range(1, 5):
d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
@@ -332,9 +334,9 @@ def test_add_features_same_booster_behaviour(tmp_path):
d1name = tmp_path / "d1.txt"
b1.save_model(d1name)
b.save_model(dname)
with open(dname, 'rt') as df:
with open(dname, "rt") as df:
dtxt = df.read()
with open(d1name, 'rt') as d1f:
with open(d1name, "rt") as d1f:
d1txt = d1f.read()
assert dtxt == d1txt
@@ -345,11 +347,12 @@ def test_add_features_from_different_sources():
n_col = 5
X = np.random.random((n_row, n_col))
xxs = [X, sparse.csr_matrix(X), pd.DataFrame(X)]
names = [f'col_{i}' for i in range(n_col)]
names = [f"col_{i}" for i in range(n_col)]
seq = _create_sequence_from_ndarray(X, 1, 30)
seq_ds = lgb.Dataset(seq, feature_name=names, free_raw_data=False).construct()
npy_list_ds = lgb.Dataset([X[:n_row // 2, :], X[n_row // 2:, :]],
feature_name=names, free_raw_data=False).construct()
npy_list_ds = lgb.Dataset(
[X[: n_row // 2, :], X[n_row // 2 :, :]], feature_name=names, free_raw_data=False
).construct()
immergeable_dds = [seq_ds, npy_list_ds]
for x_1 in xxs:
# test that method works even with free_raw_data=True
@@ -373,20 +376,19 @@ def test_add_features_from_different_sources():
d1.add_features_from(d2)
assert isinstance(d1.get_data(), original_type)
assert d1.get_data().shape == (n_row, n_col * idx)
res_feature_names += [f'D{idx}_{name}' for name in names]
res_feature_names += [f"D{idx}_{name}" for name in names]
assert d1.feature_name == res_feature_names
def test_add_features_does_not_fail_if_initial_dataset_has_zero_informative_features(capsys):
arr_a = np.zeros((100, 1), dtype=np.float32)
arr_b = np.random.normal(size=(100, 5))
dataset_a = lgb.Dataset(arr_a).construct()
expected_msg = (
'[LightGBM] [Warning] There are no meaningful features which satisfy '
'the provided configuration. Decreasing Dataset parameters min_data_in_bin '
'or min_data_in_leaf and re-constructing Dataset might resolve this warning.\n'
"[LightGBM] [Warning] There are no meaningful features which satisfy "
"the provided configuration. Decreasing Dataset parameters min_data_in_bin "
"or min_data_in_leaf and re-constructing Dataset might resolve this warning.\n"
)
log_lines = capsys.readouterr().out
assert expected_msg in log_lines
@@ -404,7 +406,7 @@ def test_cegb_affects_behavior(tmp_path):
X = np.random.random((100, 5))
X[:, [1, 3]] = 0
y = np.random.random(100)
names = [f'col_{i}' for i in range(5)]
names = [f"col_{i}" for i in range(5)]
ds = lgb.Dataset(X, feature_name=names).construct()
ds.set_label(y)
base = lgb.Booster(train_set=ds)
@@ -412,19 +414,21 @@
base.update()
basename = tmp_path / "basename.txt"
base.save_model(basename)
with open(basename, 'rt') as f:
with open(basename, "rt") as f:
basetxt = f.read()
# Set extremely harsh penalties, so CEGB will block most splits.
cases = [{'cegb_penalty_feature_coupled': [50, 100, 10, 25, 30]},
{'cegb_penalty_feature_lazy': [1, 2, 3, 4, 5]},
{'cegb_penalty_split': 1}]
cases = [
{"cegb_penalty_feature_coupled": [50, 100, 10, 25, 30]},
{"cegb_penalty_feature_lazy": [1, 2, 3, 4, 5]},
{"cegb_penalty_split": 1},
]
for case in cases:
booster = lgb.Booster(train_set=ds, params=case)
for _ in range(10):
booster.update()
casename = tmp_path / "casename.txt"
booster.save_model(casename)
with open(casename, 'rt') as f:
with open(casename, "rt") as f:
casetxt = f.read()
assert basetxt != casetxt
@@ -433,17 +437,22 @@ def test_cegb_scaling_equalities(tmp_path):
X = np.random.random((100, 5))
X[:, [1, 3]] = 0
y = np.random.random(100)
names = [f'col_{i}' for i in range(5)]
names = [f"col_{i}" for i in range(5)]
ds = lgb.Dataset(X, feature_name=names).construct()
ds.set_label(y)
# Compare pairs of penalties, to ensure scaling works as intended
pairs = [({'cegb_penalty_feature_coupled': [1, 2, 1, 2, 1]},
{'cegb_penalty_feature_coupled': [0.5, 1, 0.5, 1, 0.5], 'cegb_tradeoff': 2}),
({'cegb_penalty_feature_lazy': [0.01, 0.02, 0.03, 0.04, 0.05]},
{'cegb_penalty_feature_lazy': [0.005, 0.01, 0.015, 0.02, 0.025], 'cegb_tradeoff': 2}),
({'cegb_penalty_split': 1},
{'cegb_penalty_split': 2, 'cegb_tradeoff': 0.5})]
for (p1, p2) in pairs:
pairs = [
(
{"cegb_penalty_feature_coupled": [1, 2, 1, 2, 1]},
{"cegb_penalty_feature_coupled": [0.5, 1, 0.5, 1, 0.5], "cegb_tradeoff": 2},
),
(
{"cegb_penalty_feature_lazy": [0.01, 0.02, 0.03, 0.04, 0.05]},
{"cegb_penalty_feature_lazy": [0.005, 0.01, 0.015, 0.02, 0.025], "cegb_tradeoff": 2},
),
({"cegb_penalty_split": 1}, {"cegb_penalty_split": 2, "cegb_tradeoff": 0.5}),
]
for p1, p2 in pairs:
booster1 = lgb.Booster(train_set=ds, params=p1)
booster2 = lgb.Booster(train_set=ds, params=p2)
for _ in range(10):
@@ -453,32 +462,30 @@ def test_cegb_scaling_equalities(tmp_path):
# Reset booster1's parameters to p2, so the parameter section of the file matches.
booster1.reset_parameter(p2)
booster1.save_model(p1name)
with open(p1name, 'rt') as f:
with open(p1name, "rt") as f:
p1txt = f.read()
p2name = tmp_path / "p2.txt"
booster2.save_model(p2name)
with open(p2name, 'rt') as f:
with open(p2name, "rt") as f:
p2txt = f.read()
assert p1txt == p2txt
def test_consistent_state_for_dataset_fields():
def check_asserts(data):
np.testing.assert_allclose(data.label, data.get_label())
np.testing.assert_allclose(data.label, data.get_field('label'))
np.testing.assert_allclose(data.label, data.get_field("label"))
assert not np.isnan(data.label[0])
assert not np.isinf(data.label[1])
np.testing.assert_allclose(data.weight, data.get_weight())
np.testing.assert_allclose(data.weight, data.get_field('weight'))
np.testing.assert_allclose(data.weight, data.get_field("weight"))
assert not np.isnan(data.weight[0])
assert not np.isinf(data.weight[1])
np.testing.assert_allclose(data.init_score, data.get_init_score())
np.testing.assert_allclose(data.init_score, data.get_field('init_score'))
np.testing.assert_allclose(data.init_score, data.get_field("init_score"))
assert not np.isnan(data.init_score[0])
assert not np.isinf(data.init_score[1])
assert np.all(np.isclose([data.label[0], data.weight[0], data.init_score[0]],
data.label[0]))
assert np.all(np.isclose([data.label[0], data.weight[0], data.init_score[0]], data.label[0]))
assert data.label[1] == pytest.approx(data.weight[1])
assert data.feature_name == data.get_feature_name()
@@ -486,10 +493,8 @@ def test_consistent_state_for_dataset_fields():
sequence = np.ones(y.shape[0])
sequence[0] = np.nan
sequence[1] = np.inf
feature_names = [f'f{i}'for i in range(X.shape[1])]
lgb_data = lgb.Dataset(X, sequence,
weight=sequence, init_score=sequence,
feature_name=feature_names).construct()
feature_names = [f"f{i}" for i in range(X.shape[1])]
lgb_data = lgb.Dataset(X, sequence, weight=sequence, init_score=sequence, feature_name=feature_names).construct()
check_asserts(lgb_data)
lgb_data = lgb.Dataset(X, y).construct()
lgb_data.set_label(sequence)
@@ -500,20 +505,15 @@
def test_dataset_construction_overwrites_user_provided_metadata_fields():
X = np.array([[1.0, 2.0], [3.0, 4.0]])
position = np.array([0.0, 1.0], dtype=np.float32)
if getenv('TASK', '') == 'cuda':
if getenv("TASK", "") == "cuda":
position = None
dtrain = lgb.Dataset(
X,
params={
"min_data_in_bin": 1,
"min_data_in_leaf": 1,
"verbosity": -1
},
params={"min_data_in_bin": 1, "min_data_in_leaf": 1, "verbosity": -1},
group=[1, 1],
init_score=[0.312, 0.708],
label=[1, 2],
@@ -528,17 +528,9 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
assert dtrain.get_init_score() == [0.312, 0.708]
assert dtrain.label == [1, 2]
assert dtrain.get_label() == [1, 2]
if getenv('TASK', '') != 'cuda':
np_assert_array_equal(
dtrain.position,
np.array([0.0, 1.0], dtype=np.float32),
strict=True
)
np_assert_array_equal(
dtrain.get_position(),
np.array([0.0, 1.0], dtype=np.float32),
strict=True
)
if getenv("TASK", "") != "cuda":
np_assert_array_equal(dtrain.position, np.array([0.0, 1.0], dtype=np.float32), strict=True)
np_assert_array_equal(dtrain.get_position(), np.array([0.0, 1.0], dtype=np.float32), strict=True)
assert dtrain.weight == [0.5, 1.5]
assert dtrain.get_weight() == [0.5, 1.5]
@@ -554,13 +546,11 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
np_assert_array_equal(dtrain.group, expected_group, strict=True)
np_assert_array_equal(dtrain.get_group(), expected_group, strict=True)
# get_field("group") returns a numpy array with boundaries, instead of size
np_assert_array_equal(
dtrain.get_field("group"),
np.array([0, 1, 2], dtype=np.int32),
strict=True
)
np_assert_array_equal(dtrain.get_field("group"), np.array([0, 1, 2], dtype=np.int32), strict=True)
expected_init_score = np.array([0.312, 0.708],)
expected_init_score = np.array(
[0.312, 0.708],
)
np_assert_array_equal(dtrain.init_score, expected_init_score, strict=True)
np_assert_array_equal(dtrain.get_init_score(), expected_init_score, strict=True)
np_assert_array_equal(dtrain.get_field("init_score"), expected_init_score, strict=True)
@@ -570,16 +560,12 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
np_assert_array_equal(dtrain.get_label(), expected_label, strict=True)
np_assert_array_equal(dtrain.get_field("label"), expected_label, strict=True)
if getenv('TASK', '') != 'cuda':
if getenv("TASK", "") != "cuda":
expected_position = np.array([0.0, 1.0], dtype=np.float32)
np_assert_array_equal(dtrain.position, expected_position, strict=True)
np_assert_array_equal(dtrain.get_position(), expected_position, strict=True)
# NOTE: "position" is converted to int32 on the C++ side
np_assert_array_equal(
dtrain.get_field("position"),
np.array([0.0, 1.0], dtype=np.int32),
strict=True
)
np_assert_array_equal(dtrain.get_field("position"), np.array([0.0, 1.0], dtype=np.int32), strict=True)
expected_weight = np.array([0.5, 1.5], dtype=np.float32)
np_assert_array_equal(dtrain.weight, expected_weight, strict=True)
@@ -588,7 +574,6 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
def test_choose_param_value():
original_params = {
"local_listen_port": 1234,
"port": 2222,
@@ -599,30 +584,20 @@
# should resolve duplicate aliases, and prefer the main parameter
params = lgb.basic._choose_param_value(
main_param_name="local_listen_port",
params=original_params,
default_value=5555
main_param_name="local_listen_port", params=original_params, default_value=5555
)
assert params["local_listen_port"] == 1234
assert "port" not in params
# should choose the highest priority alias and set that value on main param
# if only aliases are used
params = lgb.basic._choose_param_value(
main_param_name="num_iterations",
params=params,
default_value=17
)
params = lgb.basic._choose_param_value(main_param_name="num_iterations", params=params, default_value=17)
assert params["num_iterations"] == 13
assert "num_trees" not in params
assert "n_iter" not in params
# should use the default if main param and aliases are missing
params = lgb.basic._choose_param_value(
main_param_name="learning_rate",
params=params,
default_value=0.789
)
params = lgb.basic._choose_param_value(main_param_name="learning_rate", params=params, default_value=0.789)
assert params["learning_rate"] == 0.789
# all changes should be made on copies and not modify the original
@@ -637,37 +612,23 @@ def test_choose_param_value():
def test_choose_param_value_preserves_nones():
# preserves None found for main param and still removes aliases
params = lgb.basic._choose_param_value(
main_param_name="num_threads",
params={
"num_threads": None,
"n_jobs": 4,
"objective": "regression"
},
default_value=2
params={"num_threads": None, "n_jobs": 4, "objective": "regression"},
default_value=2,
)
assert params == {"num_threads": None, "objective": "regression"}
# correctly chooses value when only an alias is provided
params = lgb.basic._choose_param_value(
main_param_name="num_threads",
params={
"n_jobs": None,
"objective": "regression"
},
default_value=2
main_param_name="num_threads", params={"n_jobs": None, "objective": "regression"}, default_value=2
)
assert params == {"num_threads": None, "objective": "regression"}
# adds None if that's given as the default and param not found
params = lgb.basic._choose_param_value(
main_param_name="min_data_in_leaf",
params={
"objective": "regression"
},
default_value=None
main_param_name="min_data_in_leaf", params={"objective": "regression"}, default_value=None
)
assert params == {"objective": "regression", "min_data_in_leaf": None}
@@ -676,51 +637,39 @@ def test_choose_param_value_preserves_nones():
def test_choose_param_value_objective(objective_alias):
# If callable is found in objective
params = {objective_alias: dummy_obj}
params = lgb.basic._choose_param_value(
main_param_name="objective",
params=params,
default_value=None
)
assert params['objective'] == dummy_obj
params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=None)
assert params["objective"] == dummy_obj
# Value in params should be preferred to the default_value passed from keyword arguments
params = {objective_alias: dummy_obj}
params = lgb.basic._choose_param_value(
main_param_name="objective",
params=params,
default_value=mse_obj
)
assert params['objective'] == dummy_obj
params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=mse_obj)
assert params["objective"] == dummy_obj
# None of objective or its aliases in params, but default_value is callable.
params = {}
params = lgb.basic._choose_param_value(
main_param_name="objective",
params=params,
default_value=mse_obj
)
assert params['objective'] == mse_obj
params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=mse_obj)
assert params["objective"] == mse_obj
@pytest.mark.parametrize('collection', ['1d_np', '2d_np', 'pd_float', 'pd_str', '1d_list', '2d_list'])
@pytest.mark.parametrize('dtype', [np.float32, np.float64])
@pytest.mark.parametrize("collection", ["1d_np", "2d_np", "pd_float", "pd_str", "1d_list", "2d_list"])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_list_to_1d_numpy(collection, dtype):
collection2y = {
'1d_np': np.random.rand(10),
'2d_np': np.random.rand(10, 1),
'pd_float': np.random.rand(10),
'pd_str': ['a', 'b'],
'1d_list': [1] * 10,
'2d_list': [[1], [2]],
"1d_np": np.random.rand(10),
"2d_np": np.random.rand(10, 1),
"pd_float": np.random.rand(10),
"pd_str": ["a", "b"],
"1d_list": [1] * 10,
"2d_list": [[1], [2]],
}
y = collection2y[collection]
if collection.startswith('pd'):
if collection.startswith("pd"):
if not PANDAS_INSTALLED:
pytest.skip('pandas is not installed')
pytest.skip("pandas is not installed")
else:
y = pd_Series(y)
if isinstance(y, np.ndarray) and len(y.shape) == 2:
with pytest.warns(UserWarning, match='column-vector'):
with pytest.warns(UserWarning, match="column-vector"):
lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name="list")
return
elif isinstance(y, list) and isinstance(y[0], list):
@@ -736,30 +685,31 @@ def test_list_to_1d_numpy(collection, dtype):
assert result.dtype == dtype
@pytest.mark.parametrize('init_score_type', ['array', 'dataframe', 'list'])
@pytest.mark.parametrize("init_score_type", ["array", "dataframe", "list"])
def test_init_score_for_multiclass_classification(init_score_type):
init_score = [[i * 10 + j for j in range(3)] for i in range(10)]
if init_score_type == 'array':
if init_score_type == "array":
init_score = np.array(init_score)
elif init_score_type == 'dataframe':
elif init_score_type == "dataframe":
if not PANDAS_INSTALLED:
pytest.skip('Pandas is not installed.')
pytest.skip("Pandas is not installed.")
init_score = pd_DataFrame(init_score)
data = np.random.rand(10, 2)
ds = lgb.Dataset(data, init_score=init_score).construct()
np.testing.assert_equal(ds.get_field('init_score'), init_score)
np.testing.assert_equal(ds.get_field("init_score"), init_score)
np.testing.assert_equal(ds.init_score, init_score)
def test_smoke_custom_parser(tmp_path):
data_path = Path(__file__).absolute().parents[2] / 'examples' / 'binary_classification' / 'binary.train'
parser_config_file = tmp_path / 'parser.ini'
with open(parser_config_file, 'w') as fout:
data_path = Path(__file__).absolute().parents[2] / "examples" / "binary_classification" / "binary.train"
parser_config_file = tmp_path / "parser.ini"
with open(parser_config_file, "w") as fout:
fout.write('{"className": "dummy", "id": "1"}')
data = lgb.Dataset(data_path, params={"parser_config_file": parser_config_file})
with pytest.raises(lgb.basic.LightGBMError,
match="Cannot find parser class 'dummy', please register first or check config format"):
with pytest.raises(
lgb.basic.LightGBMError, match="Cannot find parser class 'dummy', please register first or check config format"
):
data.construct()
@@ -770,9 +720,13 @@ def test_param_aliases():
assert all(isinstance(i, list) for i in aliases.values())
assert all(len(i) >= 1 for i in aliases.values())
assert all(k in v for k, v in aliases.items())
assert lgb.basic._ConfigAliases.get('config', 'task') == {'config', 'config_file', 'task', 'task_type'}
assert lgb.basic._ConfigAliases.get_sorted('min_data_in_leaf') == [
'min_data_in_leaf', 'min_data', 'min_samples_leaf', 'min_child_samples', 'min_data_per_leaf'
assert lgb.basic._ConfigAliases.get("config", "task") == {"config", "config_file", "task", "task_type"}
assert lgb.basic._ConfigAliases.get_sorted("min_data_in_leaf") == [
"min_data_in_leaf",
"min_data",
"min_samples_leaf",
"min_child_samples",
"min_data_per_leaf",
]
@@ -793,10 +747,10 @@ def test_custom_objective_safety():
y_multiclass = np.arange(nrows) % nclass
ds_binary = lgb.Dataset(X, y_binary).construct()
ds_multiclass = lgb.Dataset(X, y_multiclass).construct()
bad_bst_binary = lgb.Booster({'objective': "none"}, ds_binary)
good_bst_binary = lgb.Booster({'objective': "none"}, ds_binary)
bad_bst_multi = lgb.Booster({'objective': "none", "num_class": nclass}, ds_multiclass)
good_bst_multi = lgb.Booster({'objective': "none", "num_class": nclass}, ds_multiclass)
bad_bst_binary = lgb.Booster({"objective": "none"}, ds_binary)
good_bst_binary = lgb.Booster({"objective": "none"}, ds_binary)
bad_bst_multi = lgb.Booster({"objective": "none", "num_class": nclass}, ds_multiclass)
good_bst_multi = lgb.Booster({"objective": "none", "num_class": nclass}, ds_multiclass)
good_bst_binary.update(fobj=_good_gradients)
with pytest.raises(ValueError, match=re.escape("number of models per one iteration (1)")):
bad_bst_binary.update(fobj=_bad_gradients)
@@ -805,33 +759,30 @@
bad_bst_multi.update(fobj=_bad_gradients)
@pytest.mark.parametrize('dtype', [np.float32, np.float64])
@pytest.mark.parametrize('feature_name', [['x1', 'x2'], 'auto'])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("feature_name", [["x1", "x2"], "auto"])
def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name):
pd = pytest.importorskip('pandas')
pd = pytest.importorskip("pandas")
X = np.random.rand(10, 2).astype(dtype)
df = pd.DataFrame(X)
built_data = lgb.basic._data_from_pandas(
data=df,
feature_name=feature_name,
categorical_feature="auto",
pandas_categorical=None
data=df, feature_name=feature_name, categorical_feature="auto", pandas_categorical=None
)[0]
assert built_data.dtype == dtype
assert np.shares_memory(X, built_data)
@pytest.mark.parametrize('feature_name', [['x1'], [42], 'auto'])
@pytest.mark.parametrize('categories', ['seen', 'unseen'])
@pytest.mark.parametrize("feature_name", [["x1"], [42], "auto"])
@pytest.mark.parametrize("categories", ["seen", "unseen"])
def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories):
pd = pytest.importorskip('pandas')
X = np.random.choice(['a', 'b'], 100).reshape(-1, 1)
column_name = 'a' if feature_name == 'auto' else feature_name[0]
df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category')
if categories == 'seen':
pandas_categorical = [['a', 'b']]
pd = pytest.importorskip("pandas")
X = np.random.choice(["a", "b"], 100).reshape(-1, 1)
column_name = "a" if feature_name == "auto" else feature_name[0]
df = pd.DataFrame(X.copy(), columns=[column_name], dtype="category")
if categories == "seen":
pandas_categorical = [["a", "b"]]
else:
pandas_categorical = [['a']]
pandas_categorical = [["a"]]
data = lgb.basic._data_from_pandas(
data=df,
feature_name=feature_name,
@@ -841,31 +792,33 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, c
# check that the original data wasn't modified
np.testing.assert_equal(df[column_name], X[:, 0])
# check that the built data has the codes
if categories == 'seen':
if categories == "seen":
# if all categories were seen during training we just take the codes
codes = df[column_name].cat.codes
else:
# if we only saw 'a' during training we just replace its code
# and leave the rest as nan
a_code = df[column_name].cat.categories.get_loc('a')
codes = np.where(df[column_name] == 'a', a_code, np.nan)
a_code = df[column_name].cat.categories.get_loc("a")
codes = np.where(df[column_name] == "a", a_code, np.nan)
np.testing.assert_equal(codes, data[:, 0])
@pytest.mark.parametrize('min_data_in_bin', [2, 10])
@pytest.mark.parametrize("min_data_in_bin", [2, 10])
def test_feature_num_bin(min_data_in_bin):
X = np.vstack([
np.random.rand(100),
np.array([1, 2] * 50),
np.array([0, 1, 2] * 33 + [0]),
np.array([1, 2] * 49 + 2 * [np.nan]),
np.zeros(100),
np.random.choice([0, 1], 100),
]).T
X = np.vstack(
[
np.random.rand(100),
np.array([1, 2] * 50),
np.array([0, 1, 2] * 33 + [0]),
np.array([1, 2] * 49 + 2 * [np.nan]),
np.zeros(100),
np.random.choice([0, 1], 100),
]
).T
n_continuous = X.shape[1] - 1
feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1']
feature_name = [f"x{i}" for i in range(n_continuous)] + ["cat1"]
ds_kwargs = {
"params": {'min_data_in_bin': min_data_in_bin},
"params": {"min_data_in_bin": min_data_in_bin},
"categorical_feature": [n_continuous], # last feature
}
ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct()
@@ -884,7 +837,7 @@ def test_feature_num_bin(min_data_in_bin):
assert bins_by_name == expected_num_bins
# test using default feature names
ds_no_names = lgb.Dataset(X, **ds_kwargs).construct()
default_names = [f'Column_{i}' for i in range(X.shape[1])]
default_names = [f"Column_{i}" for i in range(X.shape[1])]
bins_by_default_name = [ds_no_names.feature_num_bin(name) for name in default_names]
assert bins_by_default_name == expected_num_bins
# check for feature indices outside of range
@@ -892,9 +845,9 @@ def test_feature_num_bin(min_data_in_bin):
with pytest.raises(
lgb.basic.LightGBMError,
match=(
f'Tried to retrieve number of bins for feature index {num_features}, '
f'but the valid feature indices are \\[0, {num_features - 1}\\].'
)
f"Tried to retrieve number of bins for feature index {num_features}, "
f"but the valid feature indices are \\[0, {num_features - 1}\\]."
),
):
ds.feature_num_bin(num_features)
@@ -902,7 +855,7 @@ def test_feature_num_bin(min_data_in_bin):
def test_feature_num_bin_with_max_bin_by_feature():
X = np.random.rand(100, 3)
max_bin_by_feature = np.random.randint(3, 30, size=X.shape[1])
ds = lgb.Dataset(X, params={'max_bin_by_feature': max_bin_by_feature}).construct()
ds = lgb.Dataset(X, params={"max_bin_by_feature": max_bin_by_feature}).construct()
actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])]
np.testing.assert_equal(actual_num_bins, max_bin_by_feature)
@@ -910,7 +863,7 @@ def test_feature_num_bin_with_max_bin_by_feature():
def test_set_leaf_output():
X, y = load_breast_cancer(return_X_y=True)
ds = lgb.Dataset(X, y)
bst = lgb.Booster({'num_leaves': 2}, ds)
bst = lgb.Booster({"num_leaves": 2}, ds)
bst.update()
y_pred = bst.predict(X)
for leaf_id in range(2):
@@ -10,7 +10,7 @@ def reset_feature_fraction(boosting_round):
return 0.6 if boosting_round < 15 else 0.8
@pytest.mark.parametrize('serializer', SERIALIZERS)
@pytest.mark.parametrize("serializer", SERIALIZERS)
def test_early_stopping_callback_is_picklable(serializer):
rounds = 5
callback = lgb.early_stopping(stopping_rounds=rounds)
@@ -32,7 +32,7 @@ def test_early_stopping_callback_rejects_invalid_stopping_rounds_with_informativ
lgb.early_stopping(stopping_rounds="neverrrr")
@pytest.mark.parametrize('serializer', SERIALIZERS)
@pytest.mark.parametrize("serializer", SERIALIZERS)
def test_log_evaluation_callback_is_picklable(serializer):
periods = 42
callback = lgb.log_evaluation(period=periods)
@@ -43,7 +43,7 @@ def test_log_evaluation_callback_is_picklable(serializer):
assert callback.period == periods
@pytest.mark.parametrize('serializer', SERIALIZERS)
@pytest.mark.parametrize("serializer", SERIALIZERS)
def test_record_evaluation_callback_is_picklable(serializer):
results = {}
callback = lgb.record_evaluation(eval_result=results)
@@ -54,12 +54,9 @@ def test_record_evaluation_callback_is_picklable(serializer):
assert callback.eval_result is results
@pytest.mark.parametrize('serializer', SERIALIZERS)
@pytest.mark.parametrize("serializer", SERIALIZERS)
def test_reset_parameter_callback_is_picklable(serializer):
params = {
'bagging_fraction': [0.7] * 5 + [0.6] * 5,
'feature_fraction': reset_feature_fraction
}
params = {"bagging_fraction": [0.7] * 5 + [0.6] * 5, "feature_fraction": reset_feature_fraction}
callback = lgb.reset_parameter(**params)
callback_from_disk = pickle_and_unpickle_object(obj=callback, serializer=serializer)
assert callback_from_disk.order == 10
@@ -6,22 +6,21 @@ from sklearn.datasets import load_svmlight_file
import lightgbm as lgb
EXAMPLES_DIR = Path(__file__).absolute().parents[2] / 'examples'
EXAMPLES_DIR = Path(__file__).absolute().parents[2] / "examples"
class FileLoader:
def __init__(self, directory, prefix, config_file='train.conf'):
def __init__(self, directory, prefix, config_file="train.conf"):
self.directory = directory
self.prefix = prefix
self.params = {'gpu_use_dp': True}
with open(self.directory / config_file, 'r') as f:
self.params = {"gpu_use_dp": True}
with open(self.directory / config_file, "r") as f:
for line in f.readlines():
line = line.strip()
if line and not line.startswith('#'):
key, value = [token.strip() for token in line.split('=')]
if 'early_stopping' not in key: # disable early_stopping
self.params[key] = value if key not in {'num_trees', 'num_threads'} else int(value)
if line and not line.startswith("#"):
key, value = [token.strip() for token in line.split("=")]
if "early_stopping" not in key: # disable early_stopping
self.params[key] = value if key not in {"num_trees", "num_threads"} else int(value)
def load_dataset(self, suffix, is_sparse=False):
filename = str(self.path(suffix))
@@ -33,14 +32,14 @@ class FileLoader:
return mat[:, 1:], mat[:, 0], filename
def load_field(self, suffix):
return np.loadtxt(str(self.directory / f'{self.prefix}{suffix}'))
return np.loadtxt(str(self.directory / f"{self.prefix}{suffix}"))
def load_cpp_result(self, result_file='LightGBM_predict_result.txt'):
def load_cpp_result(self, result_file="LightGBM_predict_result.txt"):
return np.loadtxt(str(self.directory / result_file))
def train_predict_check(self, lgb_train, X_test, X_test_fn, sk_pred):
params = dict(self.params)
params['force_row_wise'] = True
params["force_row_wise"] = True
gbm = lgb.train(params, lgb_train)
y_pred = gbm.predict(X_test)
cpp_pred = gbm.predict(X_test_fn)
@@ -49,7 +48,7 @@ class FileLoader:
def file_load_check(self, lgb_train, name):
lgb_train_f = lgb.Dataset(self.path(name), params=self.params).construct()
for f in ('num_data', 'num_feature', 'get_label', 'get_weight', 'get_init_score', 'get_group'):
for f in ("num_data", "num_feature", "get_label", "get_weight", "get_init_score", "get_group"):
a = getattr(lgb_train, f)()
b = getattr(lgb_train_f, f)()
if a is None and b is None:
@@ -62,83 +61,83 @@ class FileLoader:
assert a == b, f
def path(self, suffix):
return self.directory / f'{self.prefix}{suffix}'
return self.directory / f"{self.prefix}{suffix}"
def test_binary():
fd = FileLoader(EXAMPLES_DIR / 'binary_classification', 'binary')
X_train, y_train, _ = fd.load_dataset('.train')
X_test, _, X_test_fn = fd.load_dataset('.test')
weight_train = fd.load_field('.train.weight')
fd = FileLoader(EXAMPLES_DIR / "binary_classification", "binary")
X_train, y_train, _ = fd.load_dataset(".train")
X_test, _, X_test_fn = fd.load_dataset(".test")
weight_train = fd.load_field(".train.weight")
lgb_train = lgb.Dataset(X_train, y_train, params=fd.params, weight=weight_train)
gbm = lgb.LGBMClassifier(**fd.params)
gbm.fit(X_train, y_train, sample_weight=weight_train)
sk_pred = gbm.predict_proba(X_test)[:, 1]
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train')
fd.file_load_check(lgb_train, ".train")
def test_binary_linear():
fd = FileLoader(EXAMPLES_DIR / 'binary_classification', 'binary', 'train_linear.conf')
X_train, y_train, _ = fd.load_dataset('.train')
X_test, _, X_test_fn = fd.load_dataset('.test')
weight_train = fd.load_field('.train.weight')
fd = FileLoader(EXAMPLES_DIR / "binary_classification", "binary", "train_linear.conf")
X_train, y_train, _ = fd.load_dataset(".train")
X_test, _, X_test_fn = fd.load_dataset(".test")
weight_train = fd.load_field(".train.weight")
lgb_train = lgb.Dataset(X_train, y_train, params=fd.params, weight=weight_train)
gbm = lgb.LGBMClassifier(**fd.params)
gbm.fit(X_train, y_train, sample_weight=weight_train)
sk_pred = gbm.predict_proba(X_test)[:, 1]
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train')
fd.file_load_check(lgb_train, ".train")
def test_multiclass():
fd = FileLoader(EXAMPLES_DIR / 'multiclass_classification', 'multiclass')
X_train, y_train, _ = fd.load_dataset('.train')
X_test, _, X_test_fn = fd.load_dataset('.test')
fd = FileLoader(EXAMPLES_DIR / "multiclass_classification", "multiclass")
X_train, y_train, _ = fd.load_dataset(".train")
X_test, _, X_test_fn = fd.load_dataset(".test")
lgb_train = lgb.Dataset(X_train, y_train)
gbm = lgb.LGBMClassifier(**fd.params)
gbm.fit(X_train, y_train)
sk_pred = gbm.predict_proba(X_test)
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train')
fd.file_load_check(lgb_train, ".train")
def test_regression():
fd = FileLoader(EXAMPLES_DIR / 'regression', 'regression')
X_train, y_train, _ = fd.load_dataset('.train')
X_test, _, X_test_fn = fd.load_dataset('.test')
init_score_train = fd.load_field('.train.init')
fd = FileLoader(EXAMPLES_DIR / "regression", "regression")
X_train, y_train, _ = fd.load_dataset(".train")
X_test, _, X_test_fn = fd.load_dataset(".test")
init_score_train = fd.load_field(".train.init")
lgb_train = lgb.Dataset(X_train, y_train, init_score=init_score_train)
gbm = lgb.LGBMRegressor(**fd.params)
gbm.fit(X_train, y_train, init_score=init_score_train)
sk_pred = gbm.predict(X_test)
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train')
fd.file_load_check(lgb_train, ".train")
def test_lambdarank():
fd = FileLoader(EXAMPLES_DIR / 'lambdarank', 'rank')
X_train, y_train, _ = fd.load_dataset('.train', is_sparse=True)
X_test, _, X_test_fn = fd.load_dataset('.test', is_sparse=True)
group_train = fd.load_field('.train.query')
fd = FileLoader(EXAMPLES_DIR / "lambdarank", "rank")
X_train, y_train, _ = fd.load_dataset(".train", is_sparse=True)
X_test, _, X_test_fn = fd.load_dataset(".test", is_sparse=True)
group_train = fd.load_field(".train.query")
lgb_train = lgb.Dataset(X_train, y_train, group=group_train)
params = dict(fd.params)
params['force_col_wise'] = True
params["force_col_wise"] = True
gbm = lgb.LGBMRanker(**params)
gbm.fit(X_train, y_train, group=group_train)
sk_pred = gbm.predict(X_test)
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train')
fd.file_load_check(lgb_train, ".train")
def test_xendcg():
fd = FileLoader(EXAMPLES_DIR / 'xendcg', 'rank')
X_train, y_train, _ = fd.load_dataset('.train', is_sparse=True)
X_test, _, X_test_fn = fd.load_dataset('.test', is_sparse=True)
group_train = fd.load_field('.train.query')
fd = FileLoader(EXAMPLES_DIR / "xendcg", "rank")
X_train, y_train, _ = fd.load_dataset(".train", is_sparse=True)
X_test, _, X_test_fn = fd.load_dataset(".test", is_sparse=True)
group_train = fd.load_field(".train.query")
lgb_train = lgb.Dataset(X_train, y_train, group=group_train)
gbm = lgb.LGBMRanker(**fd.params)
gbm.fit(X_train, y_train, group=group_train)
sk_pred = gbm.predict(X_test)
fd.train_predict_check(lgb_train, X_test, X_test_fn, sk_pred)
fd.file_load_check(lgb_train, '.train')
fd.file_load_check(lgb_train, ".train")
@@ -17,12 +17,12 @@ import lightgbm as lgb
from .utils import sklearn_multiclass_custom_objective
if not platform.startswith('linux'):
pytest.skip('lightgbm.dask is currently supported in Linux environments', allow_module_level=True)
if machine() != 'x86_64':
pytest.skip('lightgbm.dask tests are currently skipped on some architectures like arm64', allow_module_level=True)
if not platform.startswith("linux"):
pytest.skip("lightgbm.dask is currently supported in Linux environments", allow_module_level=True)
if machine() != "x86_64":
pytest.skip("lightgbm.dask tests are currently skipped on some architectures like arm64", allow_module_level=True)
if not lgb.compat.DASK_INSTALLED:
pytest.skip('Dask is not installed', allow_module_level=True)
pytest.skip("Dask is not installed", allow_module_level=True)
import dask.array as da
import dask.dataframe as dd
@@ -37,46 +37,46 @@ from sklearn.datasets import make_blobs, make_regression
from .utils import make_ranking, pickle_obj, unpickle_obj
tasks = ['binary-classification', 'multiclass-classification', 'regression', 'ranking']
distributed_training_algorithms = ['data', 'voting']
data_output = ['array', 'scipy_csr_matrix', 'dataframe', 'dataframe-with-categorical']
boosting_types = ['gbdt', 'dart', 'goss', 'rf']
tasks = ["binary-classification", "multiclass-classification", "regression", "ranking"]
distributed_training_algorithms = ["data", "voting"]
data_output = ["array", "scipy_csr_matrix", "dataframe", "dataframe-with-categorical"]
boosting_types = ["gbdt", "dart", "goss", "rf"]
group_sizes = [5, 5, 5, 10, 10, 10, 20, 20, 20, 50, 50]
task_to_dask_factory = {
'regression': lgb.DaskLGBMRegressor,
'binary-classification': lgb.DaskLGBMClassifier,
'multiclass-classification': lgb.DaskLGBMClassifier,
'ranking': lgb.DaskLGBMRanker
"regression": lgb.DaskLGBMRegressor,
"binary-classification": lgb.DaskLGBMClassifier,
"multiclass-classification": lgb.DaskLGBMClassifier,
"ranking": lgb.DaskLGBMRanker,
}
task_to_local_factory = {
'regression': lgb.LGBMRegressor,
'binary-classification': lgb.LGBMClassifier,
'multiclass-classification': lgb.LGBMClassifier,
'ranking': lgb.LGBMRanker
"regression": lgb.LGBMRegressor,
"binary-classification": lgb.LGBMClassifier,
"multiclass-classification": lgb.LGBMClassifier,
"ranking": lgb.LGBMRanker,
}
pytestmark = [
pytest.mark.skipif(getenv('TASK', '') == 'mpi', reason='Fails to run with MPI interface'),
pytest.mark.skipif(getenv('TASK', '') == 'gpu', reason='Fails to run with GPU interface'),
pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Fails to run with CUDA interface')
pytest.mark.skipif(getenv("TASK", "") == "mpi", reason="Fails to run with MPI interface"),
pytest.mark.skipif(getenv("TASK", "") == "gpu", reason="Fails to run with GPU interface"),
pytest.mark.skipif(getenv("TASK", "") == "cuda", reason="Fails to run with CUDA interface"),
]
@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def cluster():
dask_cluster = LocalCluster(n_workers=2, threads_per_worker=2, dashboard_address=None)
yield dask_cluster
dask_cluster.close()
@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def cluster2():
dask_cluster = LocalCluster(n_workers=2, threads_per_worker=2, dashboard_address=None)
yield dask_cluster
dask_cluster.close()
@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def cluster_three_workers():
dask_cluster = LocalCluster(n_workers=3, threads_per_worker=1, dashboard_address=None)
yield dask_cluster
@@ -93,46 +93,43 @@ listen_port.port = 13000
def _get_workers_hostname(cluster: LocalCluster) -> str:
one_worker_address = next(iter(cluster.scheduler_info['workers']))
one_worker_address = next(iter(cluster.scheduler_info["workers"]))
return urlparse(one_worker_address).hostname
def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs):
def _create_ranking_data(n_samples=100, output="array", chunk_size=50, **kwargs):
X, y, g = make_ranking(n_samples=n_samples, random_state=42, **kwargs)
rnd = np.random.RandomState(42)
w = rnd.rand(X.shape[0]) * 0.01
g_rle = np.array([len(list(grp)) for _, grp in groupby(g)])
if output.startswith('dataframe'):
if output.startswith("dataframe"):
# add target, weight, and group to DataFrame so that partitions abide by group boundaries.
X_df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
if output == 'dataframe-with-categorical':
X_df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
if output == "dataframe-with-categorical":
for i in range(5):
col_name = f"cat_col{i}"
cat_values = rnd.choice(['a', 'b'], X.shape[0])
cat_series = pd.Series(
cat_values,
dtype='category'
)
cat_values = rnd.choice(["a", "b"], X.shape[0])
cat_series = pd.Series(cat_values, dtype="category")
X_df[col_name] = cat_series
X = X_df.copy()
X_df = X_df.assign(y=y, g=g, w=w)
# set_index ensures partitions are based on group id.
# See https://stackoverflow.com/questions/49532824/dask-dataframe-split-partitions-based-on-a-column-or-function.
X_df.set_index('g', inplace=True)
X_df.set_index("g", inplace=True)
dX = dd.from_pandas(X_df, chunksize=chunk_size)
# separate target, weight from features.
dy = dX['y']
dw = dX['w']
dX = dX.drop(columns=['y', 'w'])
dy = dX["y"]
dw = dX["w"]
dX = dX.drop(columns=["y", "w"])
dg = dX.index.to_series()
# encode group identifiers into run-length encoding, the format LightGBMRanker is expecting
# so that within each partition, sum(g) = n_samples.
dg = dg.map_partitions(lambda p: p.groupby('g', sort=False).apply(lambda z: z.shape[0]))
elif output == 'array':
dg = dg.map_partitions(lambda p: p.groupby("g", sort=False).apply(lambda z: z.shape[0]))
elif output == "array":
# ranking arrays: one chunk per group. Each chunk must include all columns.
p = X.shape[1]
dX, dy, dw, dg = [], [], [], []
@@ -148,71 +145,63 @@ def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs)
dw = da.concatenate(dw, axis=0)
dg = da.concatenate(dg, axis=0)
else:
raise ValueError('Ranking data creation only supported for Dask arrays and dataframes')
raise ValueError("Ranking data creation only supported for Dask arrays and dataframes")
return X, y, w, g_rle, dX, dy, dw, dg
def _create_data(objective, n_samples=1_000, output='array', chunk_size=500, **kwargs):
if objective.endswith('classification'):
if objective == 'binary-classification':
def _create_data(objective, n_samples=1_000, output="array", chunk_size=500, **kwargs):
if objective.endswith("classification"):
if objective == "binary-classification":
centers = [[-4, -4], [4, 4]]
elif objective == 'multiclass-classification':
elif objective == "multiclass-classification":
centers = [[-4, -4], [4, 4], [-4, 4]]
else:
raise ValueError(f"Unknown classification task '{objective}'")
X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=42)
elif objective == 'regression':
elif objective == "regression":
X, y = make_regression(n_samples=n_samples, n_features=4, n_informative=2, random_state=42)
elif objective == 'ranking':
return _create_ranking_data(
n_samples=n_samples,
output=output,
chunk_size=chunk_size,
**kwargs
)
elif objective == "ranking":
return _create_ranking_data(n_samples=n_samples, output=output, chunk_size=chunk_size, **kwargs)
else:
raise ValueError(f"Unknown objective '{objective}'")
rnd = np.random.RandomState(42)
weights = rnd.random(X.shape[0]) * 0.01
if output == 'array':
if output == "array":
dX = da.from_array(X, (chunk_size, X.shape[1]))
dy = da.from_array(y, chunk_size)
dw = da.from_array(weights, chunk_size)
elif output.startswith('dataframe'):
X_df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
if output == 'dataframe-with-categorical':
elif output.startswith("dataframe"):
X_df = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
if output == "dataframe-with-categorical":
num_cat_cols = 2
for i in range(num_cat_cols):
col_name = f"cat_col{i}"
cat_values = rnd.choice(['a', 'b'], X.shape[0])
cat_series = pd.Series(
cat_values,
dtype='category'
)
cat_values = rnd.choice(["a", "b"], X.shape[0])
cat_series = pd.Series(cat_values, dtype="category")
X_df[col_name] = cat_series
X = np.hstack((X, cat_series.cat.codes.values.reshape(-1, 1)))
# make one categorical feature relevant to the target
cat_col_is_a = X_df['cat_col0'] == 'a'
if objective == 'regression':
cat_col_is_a = X_df["cat_col0"] == "a"
if objective == "regression":
y = np.where(cat_col_is_a, y, 2 * y)
elif objective == 'binary-classification':
elif objective == "binary-classification":
y = np.where(cat_col_is_a, y, 1 - y)
elif objective == 'multiclass-classification':
elif objective == "multiclass-classification":
n_classes = 3
y = np.where(cat_col_is_a, y, (1 + y) % n_classes)
y_df = pd.Series(y, name='target')
y_df = pd.Series(y, name="target")
dX = dd.from_pandas(X_df, chunksize=chunk_size)
dy = dd.from_pandas(y_df, chunksize=chunk_size)
dw = dd.from_array(weights, chunksize=chunk_size)
elif output == 'scipy_csr_matrix':
elif output == "scipy_csr_matrix":
dX = da.from_array(X, chunks=(chunk_size, X.shape[1])).map_blocks(csr_matrix)
dy = da.from_array(y, chunks=chunk_size)
dw = da.from_array(weights, chunk_size)
X = csr_matrix(X)
elif output == 'scipy_csc_matrix':
elif output == "scipy_csc_matrix":
dX = da.from_array(X, chunks=(chunk_size, X.shape[1])).map_blocks(csc_matrix)
dy = da.from_array(y, chunks=chunk_size)
dw = da.from_array(weights, chunk_size)
@@ -234,7 +223,7 @@ def _accuracy_score(dy_true, dy_pred):
def _constant_metric(y_true, y_pred):
metric_name = 'constant_metric'
metric_name = "constant_metric"
value = 0.708
is_higher_better = False
return metric_name, value, is_higher_better
@@ -253,46 +242,32 @@
return grad, hess
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification'])
@pytest.mark.parametrize('boosting_type', boosting_types)
@pytest.mark.parametrize('tree_learner', distributed_training_algorithms)
@pytest.mark.parametrize("output", data_output)
@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification"])
@pytest.mark.parametrize("boosting_type", boosting_types)
@pytest.mark.parametrize("tree_learner", distributed_training_algorithms)
def test_classifier(output, task, boosting_type, tree_learner, cluster):
with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data(
objective=task,
output=output
)
X, y, w, _, dX, dy, dw, _ = _create_data(objective=task, output=output)
params = {"boosting_type": boosting_type, "tree_learner": tree_learner, "n_estimators": 50, "num_leaves": 31}
if boosting_type == "rf":
params.update(
{
"bagging_freq": 1,
"bagging_fraction": 0.9,
}
)
elif boosting_type == "goss":
params["top_rate"] = 0.5
params = {
"boosting_type": boosting_type,
"tree_learner": tree_learner,
"n_estimators": 50,
"num_leaves": 31
}
if boosting_type == 'rf':
params.update({
'bagging_freq': 1,
'bagging_fraction': 0.9,
})
elif boosting_type == 'goss':
params['top_rate'] = 0.5
dask_classifier = lgb.DaskLGBMClassifier(
client=client,
time_out=5,
**params
)
dask_classifier = lgb.DaskLGBMClassifier(client=client, time_out=5, **params)
dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
p1 = dask_classifier.predict(dX)
p1_raw = dask_classifier.predict(dX, raw_score=True).compute()
p1_first_iter_raw = dask_classifier.predict(dX, start_iteration=0, num_iteration=1, raw_score=True).compute()
p1_early_stop_raw = dask_classifier.predict(
dX,
pred_early_stop=True,
pred_early_stop_margin=1.0,
pred_early_stop_freq=2,
raw_score=True
dX, pred_early_stop=True, pred_early_stop_margin=1.0, pred_early_stop_freq=2, raw_score=True
).compute()
p1_proba = dask_classifier.predict_proba(dX).compute()
p1_pred_leaf = dask_classifier.predict(dX, pred_leaf=True)
@@ -306,7 +281,7 @@ def test_classifier(output, task, boosting_type, tree_learner, cluster):
p2_proba = local_classifier.predict_proba(X)
s2 = local_classifier.score(X, y)
if boosting_type == 'rf':
if boosting_type == "rf":
# https://github.com/microsoft/LightGBM/issues/4118
assert_eq(s1, s2, atol=0.01)
assert_eq(p1_proba, p2_proba, atol=0.8)
@@ -329,47 +304,30 @@
# pref_leaf values should have the right shape
# and values that look like valid tree nodes
pred_leaf_vals = p1_pred_leaf.compute()
assert pred_leaf_vals.shape == (
X.shape[0],
dask_classifier.booster_.num_trees()
)
assert np.max(pred_leaf_vals) <= params['num_leaves']
assert pred_leaf_vals.shape == (X.shape[0], dask_classifier.booster_.num_trees())
assert np.max(pred_leaf_vals) <= params["num_leaves"]
assert np.min(pred_leaf_vals) >= 0
assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']
assert len(np.unique(pred_leaf_vals)) <= params["num_leaves"]
# be sure LightGBM actually used at least one categorical column,
# and that it was correctly treated as a categorical feature
if output == 'dataframe-with-categorical':
cat_cols = [
col for col in dX.columns
if dX.dtypes[col].name == 'category'
]
if output == "dataframe-with-categorical":
cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
tree_df = dask_classifier.booster_.trees_to_dataframe()
node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
assert node_uses_cat_col.sum() > 0
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
@pytest.mark.parametrize('output', data_output + ['scipy_csc_matrix'])
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification'])
@pytest.mark.parametrize("output", data_output + ["scipy_csc_matrix"])
@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification"])
def test_classifier_pred_contrib(output, task, cluster):
with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data(
objective=task,
output=output
)
X, y, w, _, dX, dy, dw, _ = _create_data(objective=task, output=output)
params = {
"n_estimators": 10,
"num_leaves": 10
}
params = {"n_estimators": 10, "num_leaves": 10}
dask_classifier = lgb.DaskLGBMClassifier(
client=client,
time_out=5,
tree_learner='data',
**params
)
dask_classifier = lgb.DaskLGBMClassifier(client=client, time_out=5, tree_learner="data", **params)
dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
preds_with_contrib = dask_classifier.predict(dX, pred_contrib=True)
......@@ -390,10 +348,10 @@ def test_classifier_pred_contrib(output, task, cluster):
#
# since that case is so different than all other cases, check the relevant things here
# and then return early
if output.startswith('scipy') and task == 'multiclass-classification':
if output == 'scipy_csr_matrix':
if output.startswith("scipy") and task == "multiclass-classification":
if output == "scipy_csr_matrix":
expected_type = csr_matrix
elif output == 'scipy_csc_matrix':
elif output == "scipy_csc_matrix":
expected_type = csc_matrix
else:
raise ValueError(f"Unrecognized output type: {output}")
......@@ -415,20 +373,17 @@ def test_classifier_pred_contrib(output, task, cluster):
return
preds_with_contrib = preds_with_contrib.compute()
if output.startswith('scipy'):
if output.startswith("scipy"):
preds_with_contrib = preds_with_contrib.toarray()
# be sure LightGBM actually used at least one categorical column,
# and that it was correctly treated as a categorical feature
if output == 'dataframe-with-categorical':
cat_cols = [
col for col in dX.columns
if dX.dtypes[col].name == 'category'
]
if output == "dataframe-with-categorical":
cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
tree_df = dask_classifier.booster_.trees_to_dataframe()
node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
assert node_uses_cat_col.sum() > 0
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
# * shape depends on whether it is binary or multiclass classification
# * matrix for binary classification is of the form [feature_contrib, base_value],
......@@ -446,8 +401,8 @@ def test_classifier_pred_contrib(output, task, cluster):
assert len(np.unique(preds_with_contrib[:, base_value_col])) == 1
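# (illustrative note: for binary classification each row is [feature contributions..., base value],
# and the row sum equals the model's raw score for that sample)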
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification'])
@pytest.mark.parametrize("output", data_output)
@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification"])
def test_classifier_custom_objective(output, task, cluster):
with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data(
......@@ -461,25 +416,19 @@ def test_classifier_custom_objective(output, task, cluster):
"verbose": -1,
"seed": 708,
"deterministic": True,
"force_col_wise": True
"force_col_wise": True,
}
if task == 'binary-classification':
params.update({
'objective': _objective_logistic_regression,
})
elif task == 'multiclass-classification':
params.update({
'objective': sklearn_multiclass_custom_objective,
'num_classes': 3
})
dask_classifier = lgb.DaskLGBMClassifier(
client=client,
time_out=5,
tree_learner='data',
**params
)
if task == "binary-classification":
params.update(
{
"objective": _objective_logistic_regression,
}
)
elif task == "multiclass-classification":
params.update({"objective": sklearn_multiclass_custom_objective, "num_classes": 3})
dask_classifier = lgb.DaskLGBMClassifier(client=client, time_out=5, tree_learner="data", **params)
dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
dask_classifier_local = dask_classifier.to_local()
p1_raw = dask_classifier.predict(dX, raw_score=True).compute()
......@@ -490,14 +439,14 @@ def test_classifier_custom_objective(output, task, cluster):
p2_raw = local_classifier.predict(X, raw_score=True)
# with a custom objective, prediction result is a raw score instead of predicted class
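# (binary: sigmoid of the raw score; multiclass: softmax over the raw scores, then argmax for the class)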
if task == 'binary-classification':
if task == "binary-classification":
p1_proba = 1.0 / (1.0 + np.exp(-p1_raw))
p1_class = (p1_proba > 0.5).astype(np.int64)
p1_proba_local = 1.0 / (1.0 + np.exp(-p1_raw_local))
p1_class_local = (p1_proba_local > 0.5).astype(np.int64)
p2_proba = 1.0 / (1.0 + np.exp(-p2_raw))
p2_class = (p2_proba > 0.5).astype(np.int64)
elif task == 'multiclass-classification':
elif task == "multiclass-classification":
p1_proba = np.exp(p1_raw) / np.sum(np.exp(p1_raw), axis=1).reshape(-1, 1)
p1_class = p1_proba.argmax(axis=1)
p1_proba_local = np.exp(p1_raw_local) / np.sum(np.exp(p1_raw_local), axis=1).reshape(-1, 1)
......@@ -520,7 +469,7 @@ def test_classifier_custom_objective(output, task, cluster):
def test_machines_to_worker_map_unparseable_host_names():
workers = {'0.0.0.1:80': {}, '0.0.0.2:80': {}}
workers = {"0.0.0.1:80": {}, "0.0.0.2:80": {}}
machines = "0.0.0.1:80,0.0.0.2:80"
with pytest.raises(ValueError, match="Could not parse host name from worker address '0.0.0.1:80'"):
lgb.dask._machines_to_worker_map(machines=machines, worker_addresses=workers.keys())
......@@ -528,18 +477,13 @@ def test_machines_to_worker_map_unparseable_host_names():
def test_training_does_not_fail_on_port_conflicts(cluster):
with Client(cluster) as client:
_, _, _, _, dX, dy, dw, _ = _create_data('binary-classification', output='array')
_, _, _, _, dX, dy, dw, _ = _create_data("binary-classification", output="array")
lightgbm_default_port = 12400
workers_hostname = _get_workers_hostname(cluster)
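# occupy LightGBM's default local_listen_port (12400) so every fit below has to search for other free ports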
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind((workers_hostname, lightgbm_default_port))
dask_classifier = lgb.DaskLGBMClassifier(
client=client,
time_out=5,
n_estimators=5,
num_leaves=5
)
dask_classifier = lgb.DaskLGBMClassifier(client=client, time_out=5, n_estimators=5, num_leaves=5)
for _ in range(5):
dask_classifier.fit(
X=dX,
......@@ -549,15 +493,12 @@ def test_training_does_not_fail_on_port_conflicts(cluster):
assert dask_classifier.booster_
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize('boosting_type', boosting_types)
@pytest.mark.parametrize('tree_learner', distributed_training_algorithms)
@pytest.mark.parametrize("output", data_output)
@pytest.mark.parametrize("boosting_type", boosting_types)
@pytest.mark.parametrize("tree_learner", distributed_training_algorithms)
def test_regressor(output, boosting_type, tree_learner, cluster):
with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data(
objective='regression',
output=output
)
X, y, w, _, dX, dy, dw, _ = _create_data(objective="regression", output=output)
params = {
"boosting_type": boosting_type,
......@@ -565,18 +506,15 @@ def test_regressor(output, boosting_type, tree_learner, cluster):
"num_leaves": 31,
"n_estimators": 20,
}
if boosting_type == 'rf':
params.update({
'bagging_freq': 1,
'bagging_fraction': 0.9,
})
if boosting_type == "rf":
params.update(
{
"bagging_freq": 1,
"bagging_fraction": 0.9,
}
)
dask_regressor = lgb.DaskLGBMRegressor(
client=client,
time_out=5,
tree=tree_learner,
**params
)
dask_regressor = lgb.DaskLGBMRegressor(client=client, time_out=5, tree=tree_learner, **params)
dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
p1 = dask_regressor.predict(dX)
p1_pred_leaf = dask_regressor.predict(dX, pred_leaf=True)
......@@ -603,16 +541,13 @@ def test_regressor(output, boosting_type, tree_learner, cluster):
# pred_leaf values should have the right shape
# and values that look like valid tree nodes
pred_leaf_vals = p1_pred_leaf.compute()
assert pred_leaf_vals.shape == (
X.shape[0],
dask_regressor.booster_.num_trees()
)
assert np.max(pred_leaf_vals) <= params['num_leaves']
assert pred_leaf_vals.shape == (X.shape[0], dask_regressor.booster_.num_trees())
assert np.max(pred_leaf_vals) <= params["num_leaves"]
assert np.min(pred_leaf_vals) >= 0
assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']
assert len(np.unique(pred_leaf_vals)) <= params["num_leaves"]
assert_eq(p1, y, rtol=0.5, atol=50.)
assert_eq(p2, y, rtol=0.5, atol=50.)
assert_eq(p1, y, rtol=0.5, atol=50.0)
assert_eq(p2, y, rtol=0.5, atol=50.0)
# extra predict() parameters should be passed through correctly
with pytest.raises(AssertionError):
......@@ -620,36 +555,22 @@ def test_regressor(output, boosting_type, tree_learner, cluster):
# be sure LightGBM actually used at least one categorical column,
# and that it was correctly treated as a categorical feature
if output == 'dataframe-with-categorical':
cat_cols = [
col for col in dX.columns
if dX.dtypes[col].name == 'category'
]
if output == "dataframe-with-categorical":
cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
tree_df = dask_regressor.booster_.trees_to_dataframe()
node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
assert node_uses_cat_col.sum() > 0
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize("output", data_output)
def test_regressor_pred_contrib(output, cluster):
with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data(
objective='regression',
output=output
)
X, y, w, _, dX, dy, dw, _ = _create_data(objective="regression", output=output)
params = {
"n_estimators": 10,
"num_leaves": 10
}
params = {"n_estimators": 10, "num_leaves": 10}
dask_regressor = lgb.DaskLGBMRegressor(
client=client,
time_out=5,
tree_learner='data',
**params
)
dask_regressor = lgb.DaskLGBMRegressor(client=client, time_out=5, tree_learner="data", **params)
dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
preds_with_contrib = dask_regressor.predict(dX, pred_contrib=True).compute()
......@@ -668,39 +589,23 @@ def test_regressor_pred_contrib(output, cluster):
# be sure LightGBM actually used at least one categorical column,
# and that it was correctly treated as a categorical feature
if output == 'dataframe-with-categorical':
cat_cols = [
col for col in dX.columns
if dX.dtypes[col].name == 'category'
]
if output == "dataframe-with-categorical":
cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
tree_df = dask_regressor.booster_.trees_to_dataframe()
node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
assert node_uses_cat_col.sum() > 0
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize('alpha', [.1, .5, .9])
@pytest.mark.parametrize("output", data_output)
@pytest.mark.parametrize("alpha", [0.1, 0.5, 0.9])
def test_regressor_quantile(output, alpha, cluster):
with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data(
objective='regression',
output=output
)
X, y, w, _, dX, dy, dw, _ = _create_data(objective="regression", output=output)
params = {
"objective": "quantile",
"alpha": alpha,
"random_state": 42,
"n_estimators": 10,
"num_leaves": 10
}
params = {"objective": "quantile", "alpha": alpha, "random_state": 42, "n_estimators": 10, "num_leaves": 10}
dask_regressor = lgb.DaskLGBMRegressor(
client=client,
tree_learner_type='data_parallel',
**params
)
dask_regressor = lgb.DaskLGBMRegressor(client=client, tree_learner_type="data_parallel", **params)
dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
p1 = dask_regressor.predict(dX).compute()
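# for the quantile objective, roughly an `alpha` fraction of observed targets should fall below the predictions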
q1 = np.count_nonzero(y < p1) / y.shape[0]
......@@ -716,37 +621,22 @@ def test_regressor_quantile(output, alpha, cluster):
# be sure LightGBM actually used at least one categorical column,
# and that it was correctly treated as a categorical feature
if output == 'dataframe-with-categorical':
cat_cols = [
col for col in dX.columns
if dX.dtypes[col].name == 'category'
]
if output == "dataframe-with-categorical":
cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
tree_df = dask_regressor.booster_.trees_to_dataframe()
node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
assert node_uses_cat_col.sum() > 0
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize("output", data_output)
def test_regressor_custom_objective(output, cluster):
with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data(
objective='regression',
output=output
)
X, y, w, _, dX, dy, dw, _ = _create_data(objective="regression", output=output)
params = {
"n_estimators": 10,
"num_leaves": 10,
"objective": _objective_least_squares
}
params = {"n_estimators": 10, "num_leaves": 10, "objective": _objective_least_squares}
dask_regressor = lgb.DaskLGBMRegressor(
client=client,
time_out=5,
tree_learner='data',
**params
)
dask_regressor = lgb.DaskLGBMRegressor(client=client, time_out=5, tree_learner="data", **params)
dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
dask_regressor_local = dask_regressor.to_local()
p1 = dask_regressor.predict(dX)
......@@ -772,34 +662,26 @@ def test_regressor_custom_objective(output, cluster):
assert_eq(p1, p1_local)
# predictions should be better than random
assert_precision = {"rtol": 0.5, "atol": 50.}
assert_precision = {"rtol": 0.5, "atol": 50.0}
assert_eq(p1, y, **assert_precision)
assert_eq(p2, y, **assert_precision)
@pytest.mark.parametrize('output', ['array', 'dataframe', 'dataframe-with-categorical'])
@pytest.mark.parametrize('group', [None, group_sizes])
@pytest.mark.parametrize('boosting_type', boosting_types)
@pytest.mark.parametrize('tree_learner', distributed_training_algorithms)
@pytest.mark.parametrize("output", ["array", "dataframe", "dataframe-with-categorical"])
@pytest.mark.parametrize("group", [None, group_sizes])
@pytest.mark.parametrize("boosting_type", boosting_types)
@pytest.mark.parametrize("tree_learner", distributed_training_algorithms)
def test_ranker(output, group, boosting_type, tree_learner, cluster):
with Client(cluster) as client:
if output == 'dataframe-with-categorical':
if output == "dataframe-with-categorical":
X, y, w, g, dX, dy, dw, dg = _create_data(
objective='ranking',
output=output,
group=group,
n_features=1,
n_informative=1
objective="ranking", output=output, group=group, n_features=1, n_informative=1
)
else:
X, y, w, g, dX, dy, dw, dg = _create_data(
objective='ranking',
output=output,
group=group
)
X, y, w, g, dX, dy, dw, dg = _create_data(objective="ranking", output=output, group=group)
# rebalance small dask.Array dataset for better performance.
if output == 'array':
if output == "array":
dX = dX.persist()
dy = dy.persist()
dw = dw.persist()
......@@ -814,20 +696,17 @@ def test_ranker(output, group, boosting_type, tree_learner, cluster):
"random_state": 42,
"n_estimators": 50,
"num_leaves": 20,
"min_child_samples": 1
"min_child_samples": 1,
}
if boosting_type == 'rf':
params.update({
'bagging_freq': 1,
'bagging_fraction': 0.9,
})
dask_ranker = lgb.DaskLGBMRanker(
client=client,
time_out=5,
tree_learner_type=tree_learner,
**params
)
if boosting_type == "rf":
params.update(
{
"bagging_freq": 1,
"bagging_fraction": 0.9,
}
)
dask_ranker = lgb.DaskLGBMRanker(client=client, time_out=5, tree_learner_type=tree_learner, **params)
dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
rnkvec_dask = dask_ranker.predict(dX)
rnkvec_dask = rnkvec_dask.compute()
......@@ -835,11 +714,7 @@ def test_ranker(output, group, boosting_type, tree_learner, cluster):
p1_raw = dask_ranker.predict(dX, raw_score=True).compute()
p1_first_iter_raw = dask_ranker.predict(dX, start_iteration=0, num_iteration=1, raw_score=True).compute()
p1_early_stop_raw = dask_ranker.predict(
dX,
pred_early_stop=True,
pred_early_stop_margin=1.0,
pred_early_stop_freq=2,
raw_score=True
dX, pred_early_stop=True, pred_early_stop_margin=1.0, pred_early_stop_freq=2, raw_score=True
).compute()
rnkvec_dask_local = dask_ranker.to_local().predict(X)
......@@ -864,47 +739,33 @@ def test_ranker(output, group, boosting_type, tree_learner, cluster):
# pred_leaf values should have the right shape
# and values that look like valid tree nodes
pred_leaf_vals = p1_pred_leaf.compute()
assert pred_leaf_vals.shape == (
X.shape[0],
dask_ranker.booster_.num_trees()
)
assert np.max(pred_leaf_vals) <= params['num_leaves']
assert pred_leaf_vals.shape == (X.shape[0], dask_ranker.booster_.num_trees())
assert np.max(pred_leaf_vals) <= params["num_leaves"]
assert np.min(pred_leaf_vals) >= 0
assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']
assert len(np.unique(pred_leaf_vals)) <= params["num_leaves"]
# be sure LightGBM actually used at least one categorical column,
# and that it was correctly treated as a categorical feature
if output == 'dataframe-with-categorical':
cat_cols = [
col for col in dX.columns
if dX.dtypes[col].name == 'category'
]
if output == "dataframe-with-categorical":
cat_cols = [col for col in dX.columns if dX.dtypes[col].name == "category"]
tree_df = dask_ranker.booster_.trees_to_dataframe()
node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
node_uses_cat_col = tree_df["split_feature"].isin(cat_cols)
assert node_uses_cat_col.sum() > 0
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='
assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == "=="
@pytest.mark.parametrize('output', ['array', 'dataframe', 'dataframe-with-categorical'])
@pytest.mark.parametrize("output", ["array", "dataframe", "dataframe-with-categorical"])
def test_ranker_custom_objective(output, cluster):
with Client(cluster) as client:
if output == 'dataframe-with-categorical':
if output == "dataframe-with-categorical":
X, y, w, g, dX, dy, dw, dg = _create_data(
objective='ranking',
output=output,
group=group_sizes,
n_features=1,
n_informative=1
objective="ranking", output=output, group=group_sizes, n_features=1, n_informative=1
)
else:
X, y, w, g, dX, dy, dw, dg = _create_data(
objective='ranking',
output=output,
group=group_sizes
)
X, y, w, g, dX, dy, dw, dg = _create_data(objective="ranking", output=output, group=group_sizes)
# rebalance small dask.Array dataset for better performance.
if output == 'array':
if output == "array":
dX = dX.persist()
dy = dy.persist()
dw = dw.persist()
......@@ -917,15 +778,10 @@ def test_ranker_custom_objective(output, cluster):
"n_estimators": 50,
"num_leaves": 20,
"min_child_samples": 1,
"objective": _objective_least_squares
"objective": _objective_least_squares,
}
dask_ranker = lgb.DaskLGBMRanker(
client=client,
time_out=5,
tree_learner_type="data",
**params
)
dask_ranker = lgb.DaskLGBMRanker(client=client, time_out=5, tree_learner_type="data", **params)
dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
rnkvec_dask = dask_ranker.predict(dX).compute()
dask_ranker_local = dask_ranker.to_local()
......@@ -946,13 +802,13 @@ def test_ranker_custom_objective(output, cluster):
assert callable(dask_ranker_local.objective_)
@pytest.mark.parametrize('task', tasks)
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize('eval_sizes', [[0.5, 1, 1.5], [0]])
@pytest.mark.parametrize('eval_names_prefix', ['specified', None])
@pytest.mark.parametrize("task", tasks)
@pytest.mark.parametrize("output", data_output)
@pytest.mark.parametrize("eval_sizes", [[0.5, 1, 1.5], [0]])
@pytest.mark.parametrize("eval_names_prefix", ["specified", None])
def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix, cluster):
if task == 'ranking' and output == 'scipy_csr_matrix':
pytest.skip('LGBMRanker is not currently tested on sparse matrices')
if task == "ranking" and output == "scipy_csr_matrix":
pytest.skip("LGBMRanker is not currently tested on sparse matrices")
with Client(cluster) as client:
# Use larger trainset to prevent premature stopping due to zero loss, causing num_trees() < n_estimators.
......@@ -966,36 +822,33 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix,
eval_init_score = None
if eval_names_prefix:
eval_names = [f'{eval_names_prefix}_{i}' for i in range(len(eval_sizes))]
eval_names = [f"{eval_names_prefix}_{i}" for i in range(len(eval_sizes))]
else:
eval_names = None
X, y, w, g, dX, dy, dw, dg = _create_data(
objective=task,
n_samples=n_samples,
output=output,
chunk_size=chunk_size
objective=task, n_samples=n_samples, output=output, chunk_size=chunk_size
)
if task == 'ranking':
eval_metrics = ['ndcg']
if task == "ranking":
eval_metrics = ["ndcg"]
eval_at = (5, 6)
eval_metric_names = [f'ndcg@{k}' for k in eval_at]
eval_metric_names = [f"ndcg@{k}" for k in eval_at]
eval_group = []
else:
# test eval_class_weight, eval_init_score on binary-classification task.
# Note: objective's default `metric` will be evaluated in evals_result_ in addition to all eval_metrics.
if task == 'binary-classification':
eval_metrics = ['binary_error', 'auc']
eval_metric_names = ['binary_logloss', 'binary_error', 'auc']
if task == "binary-classification":
eval_metrics = ["binary_error", "auc"]
eval_metric_names = ["binary_logloss", "binary_error", "auc"]
eval_class_weight = []
eval_init_score = []
elif task == 'multiclass-classification':
eval_metrics = ['multi_error']
eval_metric_names = ['multi_logloss', 'multi_error']
elif task == 'regression':
eval_metrics = ['l1']
eval_metric_names = ['l2', 'l1']
elif task == "multiclass-classification":
eval_metrics = ["multi_error"]
eval_metric_names = ["multi_logloss", "multi_error"]
elif task == "regression":
eval_metrics = ["l1"]
eval_metric_names = ["l2", "l1"]
# create eval_sets by creating new datasets or copying training data.
for eval_size in eval_sizes:
......@@ -1008,23 +861,20 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix,
else:
n_eval_samples = max(chunk_size, int(n_samples * eval_size))
_, y_e, _, _, dX_e, dy_e, dw_e, dg_e = _create_data(
objective=task,
n_samples=n_eval_samples,
output=output,
chunk_size=chunk_size
objective=task, n_samples=n_eval_samples, output=output, chunk_size=chunk_size
)
eval_set.append((dX_e, dy_e))
eval_sample_weight.append(dw_e)
if task == 'ranking':
if task == "ranking":
eval_group.append(dg_e)
if task == 'binary-classification':
if task == "binary-classification":
n_neg = np.sum(y_e == 0)
n_pos = np.sum(y_e == 1)
eval_class_weight.append({0: n_neg / n_pos, 1: n_pos / n_neg})
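# constant init_score = log-odds of the positive-class rate; mapping it back through the sigmoid
# recovers that rate (up to rounding): 1 / (1 + np.exp(-np.log(p / (1 - p)))) ~ p for p = np.mean(y_e)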
init_score_value = np.log(np.mean(y_e) / (1 - np.mean(y_e)))
if 'dataframe' in output:
if "dataframe" in output:
d_init_score = dy_e.map_partitions(lambda x, val=init_score_value: pd.Series([val] * x.size))
else:
d_init_score = dy_e.map_blocks(lambda x, val=init_score_value: np.repeat(val, x.size))
......@@ -1032,44 +882,36 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix,
eval_init_score.append(d_init_score)
fit_trees = 50
params = {
"random_state": 42,
"n_estimators": fit_trees,
"num_leaves": 2
}
params = {"random_state": 42, "n_estimators": fit_trees, "num_leaves": 2}
model_factory = task_to_dask_factory[task]
dask_model = model_factory(
client=client,
**params
)
dask_model = model_factory(client=client, **params)
fit_params = {
'X': dX,
'y': dy,
'eval_set': eval_set,
'eval_names': eval_names,
'eval_sample_weight': eval_sample_weight,
'eval_init_score': eval_init_score,
'eval_metric': eval_metrics
"X": dX,
"y": dy,
"eval_set": eval_set,
"eval_names": eval_names,
"eval_sample_weight": eval_sample_weight,
"eval_init_score": eval_init_score,
"eval_metric": eval_metrics,
}
if task == 'ranking':
fit_params.update(
{'group': dg,
'eval_group': eval_group,
'eval_at': eval_at}
)
elif task == 'binary-classification':
fit_params.update({'eval_class_weight': eval_class_weight})
if task == "ranking":
fit_params.update({"group": dg, "eval_group": eval_group, "eval_at": eval_at})
elif task == "binary-classification":
fit_params.update({"eval_class_weight": eval_class_weight})
if eval_sizes == [0]:
with pytest.warns(UserWarning, match='Worker (.*) was not allocated eval_set data. Therefore evals_result_ and best_score_ data may be unreliable.'):
with pytest.warns(
UserWarning,
match="Worker (.*) was not allocated eval_set data. Therefore evals_result_ and best_score_ data may be unreliable.",
):
dask_model.fit(**fit_params)
else:
dask_model = dask_model.fit(**fit_params)
# total number of trees scales up for ova classifier.
if task == 'multiclass-classification':
if task == "multiclass-classification":
model_trees = fit_trees * dask_model.n_classes_
else:
model_trees = fit_trees
......@@ -1098,67 +940,45 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix,
assert len(evals_result[eval_name][metric]) == fit_trees
@pytest.mark.parametrize('task', ['binary-classification', 'regression', 'ranking'])
@pytest.mark.parametrize("task", ["binary-classification", "regression", "ranking"])
def test_eval_set_with_custom_eval_metric(task, cluster):
with Client(cluster) as client:
n_samples = 1000
n_eval_samples = int(n_samples * 0.5)
chunk_size = 10
output = 'array'
output = "array"
X, y, w, g, dX, dy, dw, dg = _create_data(
objective=task,
n_samples=n_samples,
output=output,
chunk_size=chunk_size
objective=task, n_samples=n_samples, output=output, chunk_size=chunk_size
)
_, _, _, _, dX_e, dy_e, _, dg_e = _create_data(
objective=task,
n_samples=n_eval_samples,
output=output,
chunk_size=chunk_size
objective=task, n_samples=n_eval_samples, output=output, chunk_size=chunk_size
)
if task == 'ranking':
if task == "ranking":
eval_at = (5, 6)
eval_metrics = ['ndcg', _constant_metric]
eval_metric_names = [f'ndcg@{k}' for k in eval_at] + ['constant_metric']
elif task == 'binary-classification':
eval_metrics = ['binary_error', 'auc', _constant_metric]
eval_metric_names = ['binary_logloss', 'binary_error', 'auc', 'constant_metric']
eval_metrics = ["ndcg", _constant_metric]
eval_metric_names = [f"ndcg@{k}" for k in eval_at] + ["constant_metric"]
elif task == "binary-classification":
eval_metrics = ["binary_error", "auc", _constant_metric]
eval_metric_names = ["binary_logloss", "binary_error", "auc", "constant_metric"]
else:
eval_metrics = ['l1', _constant_metric]
eval_metric_names = ['l2', 'l1', 'constant_metric']
eval_metrics = ["l1", _constant_metric]
eval_metric_names = ["l2", "l1", "constant_metric"]
fit_trees = 50
params = {
"random_state": 42,
"n_estimators": fit_trees,
"num_leaves": 2
}
params = {"random_state": 42, "n_estimators": fit_trees, "num_leaves": 2}
model_factory = task_to_dask_factory[task]
dask_model = model_factory(
client=client,
**params
)
dask_model = model_factory(client=client, **params)
eval_set = [(dX_e, dy_e)]
fit_params = {
'X': dX,
'y': dy,
'eval_set': eval_set,
'eval_metric': eval_metrics
}
if task == 'ranking':
fit_params.update(
{'group': dg,
'eval_group': [dg_e],
'eval_at': eval_at}
)
fit_params = {"X": dX, "y": dy, "eval_set": eval_set, "eval_metric": eval_metrics}
if task == "ranking":
fit_params.update({"group": dg, "eval_group": [dg_e], "eval_at": eval_at})
dask_model = dask_model.fit(**fit_params)
eval_name = 'valid_0'
eval_name = "valid_0"
evals_result = dask_model.evals_result_
assert len(evals_result) == 1
assert eval_name in evals_result
......@@ -1167,29 +987,21 @@ def test_eval_set_with_custom_eval_metric(task, cluster):
assert metric in evals_result[eval_name]
assert len(evals_result[eval_name][metric]) == fit_trees
np.testing.assert_allclose(evals_result[eval_name]['constant_metric'], 0.708)
np.testing.assert_allclose(evals_result[eval_name]["constant_metric"], 0.708)
@pytest.mark.parametrize('task', tasks)
@pytest.mark.parametrize("task", tasks)
def test_training_works_if_client_not_provided_or_set_after_construction(task, cluster):
with Client(cluster) as client:
_, _, _, _, dX, dy, _, dg = _create_data(
objective=task,
output='array',
group=None
)
_, _, _, _, dX, dy, _, dg = _create_data(objective=task, output="array", group=None)
model_factory = task_to_dask_factory[task]
params = {
"time_out": 5,
"n_estimators": 1,
"num_leaves": 2
}
params = {"time_out": 5, "n_estimators": 1, "num_leaves": 2}
# should be able to use the class without specifying a client
dask_model = model_factory(**params)
assert dask_model.client is None
with pytest.raises(lgb.compat.LGBMNotFittedError, match='Cannot access property client_ before calling fit'):
with pytest.raises(lgb.compat.LGBMNotFittedError, match="Cannot access property client_ before calling fit"):
dask_model.client_
dask_model.fit(dX, dy, group=dg)
......@@ -1213,7 +1025,7 @@ def test_training_works_if_client_not_provided_or_set_after_construction(task, c
dask_model.set_params(client=client)
assert dask_model.client == client
with pytest.raises(lgb.compat.LGBMNotFittedError, match='Cannot access property client_ before calling fit'):
with pytest.raises(lgb.compat.LGBMNotFittedError, match="Cannot access property client_ before calling fit"):
dask_model.client_
dask_model.fit(dX, dy, group=dg)
......@@ -1233,34 +1045,23 @@ def test_training_works_if_client_not_provided_or_set_after_construction(task, c
local_model.client_
@pytest.mark.parametrize('serializer', ['pickle', 'joblib', 'cloudpickle'])
@pytest.mark.parametrize('task', tasks)
@pytest.mark.parametrize('set_client', [True, False])
def test_model_and_local_version_are_picklable_whether_or_not_client_set_explicitly(serializer, task, set_client, tmp_path, cluster, cluster2):
@pytest.mark.parametrize("serializer", ["pickle", "joblib", "cloudpickle"])
@pytest.mark.parametrize("task", tasks)
@pytest.mark.parametrize("set_client", [True, False])
def test_model_and_local_version_are_picklable_whether_or_not_client_set_explicitly(
serializer, task, set_client, tmp_path, cluster, cluster2
):
with Client(cluster) as client1:
# data on cluster1
X_1, _, _, _, dX_1, dy_1, _, dg_1 = _create_data(
objective=task,
output='array',
group=None
)
X_1, _, _, _, dX_1, dy_1, _, dg_1 = _create_data(objective=task, output="array", group=None)
with Client(cluster2) as client2:
# create identical data on cluster2
X_2, _, _, _, dX_2, dy_2, _, dg_2 = _create_data(
objective=task,
output='array',
group=None
)
X_2, _, _, _, dX_2, dy_2, _, dg_2 = _create_data(objective=task, output="array", group=None)
model_factory = task_to_dask_factory[task]
params = {
"time_out": 5,
"n_estimators": 1,
"num_leaves": 2
}
params = {"time_out": 5, "n_estimators": 1, "num_leaves": 2}
# at this point, the result of default_client() is client2 since it was the most recently
# created. So setting client to client1 here to test that you can select a non-default client
......@@ -1277,33 +1078,21 @@ def test_model_and_local_version_are_picklable_whether_or_not_client_set_explici
else:
assert dask_model.client is None
with pytest.raises(lgb.compat.LGBMNotFittedError, match='Cannot access property client_ before calling fit'):
with pytest.raises(
lgb.compat.LGBMNotFittedError, match="Cannot access property client_ before calling fit"
):
dask_model.client_
assert "client" not in local_model.get_params()
assert getattr(local_model, "client", None) is None
tmp_file = tmp_path / "model-1.pkl"
pickle_obj(
obj=dask_model,
filepath=tmp_file,
serializer=serializer
)
model_from_disk = unpickle_obj(
filepath=tmp_file,
serializer=serializer
)
pickle_obj(obj=dask_model, filepath=tmp_file, serializer=serializer)
model_from_disk = unpickle_obj(filepath=tmp_file, serializer=serializer)
local_tmp_file = tmp_path / "local-model-1.pkl"
pickle_obj(
obj=local_model,
filepath=local_tmp_file,
serializer=serializer
)
local_model_from_disk = unpickle_obj(
filepath=local_tmp_file,
serializer=serializer
)
pickle_obj(obj=local_model, filepath=local_tmp_file, serializer=serializer)
local_model_from_disk = unpickle_obj(filepath=local_tmp_file, serializer=serializer)
assert model_from_disk.client is None
......@@ -1312,7 +1101,9 @@ def test_model_and_local_version_are_picklable_whether_or_not_client_set_explici
else:
assert dask_model.client is None
with pytest.raises(lgb.compat.LGBMNotFittedError, match='Cannot access property client_ before calling fit'):
with pytest.raises(
lgb.compat.LGBMNotFittedError, match="Cannot access property client_ before calling fit"
):
dask_model.client_
# client will always be None after unpickling
......@@ -1340,26 +1131,12 @@ def test_model_and_local_version_are_picklable_whether_or_not_client_set_explici
local_model.client_
tmp_file2 = tmp_path / "model-2.pkl"
pickle_obj(
obj=dask_model,
filepath=tmp_file2,
serializer=serializer
)
fitted_model_from_disk = unpickle_obj(
filepath=tmp_file2,
serializer=serializer
)
pickle_obj(obj=dask_model, filepath=tmp_file2, serializer=serializer)
fitted_model_from_disk = unpickle_obj(filepath=tmp_file2, serializer=serializer)
local_tmp_file2 = tmp_path / "local-model-2.pkl"
pickle_obj(
obj=local_model,
filepath=local_tmp_file2,
serializer=serializer
)
local_fitted_model_from_disk = unpickle_obj(
filepath=local_tmp_file2,
serializer=serializer
)
pickle_obj(obj=local_model, filepath=local_tmp_file2, serializer=serializer)
local_fitted_model_from_disk = unpickle_obj(filepath=local_tmp_file2, serializer=serializer)
if set_client:
assert dask_model.client == client1
......@@ -1405,35 +1182,25 @@ def test_warns_and_continues_on_unrecognized_tree_learner(cluster):
X = da.random.random((1e3, 10))
y = da.random.random((1e3, 1))
dask_regressor = lgb.DaskLGBMRegressor(
client=client,
time_out=5,
tree_learner='some-nonsense-value',
n_estimators=1,
num_leaves=2
client=client, time_out=5, tree_learner="some-nonsense-value", n_estimators=1, num_leaves=2
)
with pytest.warns(UserWarning, match='Parameter tree_learner set to some-nonsense-value'):
with pytest.warns(UserWarning, match="Parameter tree_learner set to some-nonsense-value"):
dask_regressor = dask_regressor.fit(X, y)
assert dask_regressor.fitted_
@pytest.mark.parametrize('tree_learner', ['data_parallel', 'voting_parallel'])
@pytest.mark.parametrize("tree_learner", ["data_parallel", "voting_parallel"])
def test_training_respects_tree_learner_aliases(tree_learner, cluster):
with Client(cluster) as client:
task = 'regression'
_, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output='array')
task = "regression"
_, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output="array")
dask_factory = task_to_dask_factory[task]
dask_model = dask_factory(
client=client,
tree_learner=tree_learner,
time_out=5,
n_estimators=10,
num_leaves=15
)
dask_model = dask_factory(client=client, tree_learner=tree_learner, time_out=5, n_estimators=10, num_leaves=15)
dask_model.fit(dX, dy, sample_weight=dw, group=dg)
assert dask_model.fitted_
assert dask_model.get_params()['tree_learner'] == tree_learner
assert dask_model.get_params()["tree_learner"] == tree_learner
def test_error_on_feature_parallel_tree_learner(cluster):
......@@ -1444,39 +1211,30 @@ def test_error_on_feature_parallel_tree_learner(cluster):
_ = wait([X, y])
client.rebalance()
dask_regressor = lgb.DaskLGBMRegressor(
client=client,
time_out=5,
tree_learner='feature_parallel',
n_estimators=1,
num_leaves=2
client=client, time_out=5, tree_learner="feature_parallel", n_estimators=1, num_leaves=2
)
with pytest.raises(lgb.basic.LightGBMError, match='Do not support feature parallel in c api'):
with pytest.raises(lgb.basic.LightGBMError, match="Do not support feature parallel in c api"):
dask_regressor = dask_regressor.fit(X, y)
def test_errors(cluster):
with Client(cluster) as client:
def f(part):
raise Exception('foo')
raise Exception("foo")
df = dd.demo.make_timeseries()
df = df.map_partitions(f, meta=df._meta)
with pytest.raises(Exception) as info:
lgb.dask._train(
client=client,
data=df,
label=df.x,
params={},
model_factory=lgb.LGBMClassifier
)
assert 'foo' in str(info.value)
lgb.dask._train(client=client, data=df, label=df.x, params={}, model_factory=lgb.LGBMClassifier)
assert "foo" in str(info.value)
@pytest.mark.parametrize('task', tasks)
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize("task", tasks)
@pytest.mark.parametrize("output", data_output)
def test_training_succeeds_even_if_some_workers_do_not_have_any_data(task, output, cluster_three_workers):
if task == 'ranking' and output == 'scipy_csr_matrix':
pytest.skip('LGBMRanker is not currently tested on sparse matrices')
if task == "ranking" and output == "scipy_csr_matrix":
pytest.skip("LGBMRanker is not currently tested on sparse matrices")
with Client(cluster_three_workers) as client:
_, y, _, _, dX, dy, dw, dg = _create_data(
......@@ -1489,7 +1247,7 @@ def test_training_succeeds_even_if_some_workers_do_not_have_any_data(task, outpu
dask_model_factory = task_to_dask_factory[task]
workers = list(client.scheduler_info()['workers'].keys())
workers = list(client.scheduler_info()["workers"].keys())
assert len(workers) == 3
first_two_workers = workers[:2]
......@@ -1506,33 +1264,28 @@ def test_training_succeeds_even_if_some_workers_do_not_have_any_data(task, outpu
assert len(workers_with_data) == 2
params = {
'time_out': 5,
'random_state': 42,
'num_leaves': 10,
'n_estimators': 20,
"time_out": 5,
"random_state": 42,
"num_leaves": 10,
"n_estimators": 20,
}
dask_model = dask_model_factory(tree='data', client=client, **params)
dask_model = dask_model_factory(tree="data", client=client, **params)
dask_model.fit(dX, dy, group=dg, sample_weight=dw)
dask_preds = dask_model.predict(dX).compute()
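# score with a task-appropriate metric: R^2 for regression, accuracy for classification, Spearman correlation for ranking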
if task == 'regression':
if task == "regression":
score = r2_score(y, dask_preds)
elif task.endswith('classification'):
elif task.endswith("classification"):
score = accuracy_score(y, dask_preds)
else:
score = spearmanr(dask_preds, y).correlation
assert score > 0.9
@pytest.mark.parametrize('task', tasks)
@pytest.mark.parametrize("task", tasks)
def test_network_params_not_required_but_respected_if_given(task, listen_port, cluster):
with Client(cluster) as client:
_, _, _, _, dX, dy, _, dg = _create_data(
objective=task,
output='array',
chunk_size=10,
group=None
)
_, _, _, _, dX, dy, _, dg = _create_data(objective=task, output="array", chunk_size=10, group=None)
dask_model_factory = task_to_dask_factory[task]
......@@ -1547,11 +1300,11 @@ def test_network_params_not_required_but_respected_if_given(task, listen_port, c
dask_model1.fit(dX, dy, group=dg)
assert dask_model1.fitted_
params = dask_model1.get_params()
assert 'local_listen_port' not in params
assert 'machines' not in params
assert "local_listen_port" not in params
assert "machines" not in params
# model 2 - machines given
workers = list(client.scheduler_info()['workers'])
workers = list(client.scheduler_info()["workers"])
workers_hostname = _get_workers_hostname(cluster)
remote_sockets, open_ports = lgb.dask._assign_open_ports_to_workers(client, workers)
for s in remote_sockets.values():
......@@ -1559,58 +1312,43 @@ def test_network_params_not_required_but_respected_if_given(task, listen_port, c
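# LightGBM's `machines` parameter is a comma-separated list of host:port entries, one per worker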
dask_model2 = dask_model_factory(
n_estimators=5,
num_leaves=5,
machines=",".join([
f"{workers_hostname}:{port}"
for port in open_ports.values()
]),
machines=",".join([f"{workers_hostname}:{port}" for port in open_ports.values()]),
)
dask_model2.fit(dX, dy, group=dg)
assert dask_model2.fitted_
params = dask_model2.get_params()
assert 'local_listen_port' not in params
assert 'machines' in params
assert "local_listen_port" not in params
assert "machines" in params
# model 3 - local_listen_port given
# training should fail because LightGBM will try to use the same
# port for multiple worker processes on the same machine
dask_model3 = dask_model_factory(
n_estimators=5,
num_leaves=5,
local_listen_port=listen_port
)
dask_model3 = dask_model_factory(n_estimators=5, num_leaves=5, local_listen_port=listen_port)
error_msg = "has multiple Dask worker processes running on it"
with pytest.raises(lgb.basic.LightGBMError, match=error_msg):
dask_model3.fit(dX, dy, group=dg)
@pytest.mark.parametrize('task', tasks)
@pytest.mark.parametrize("task", tasks)
def test_machines_should_be_used_if_provided(task, cluster):
pytest.skip("skipping due to timeout issues discussed in https://github.com/microsoft/LightGBM/issues/5390")
with Client(cluster) as client:
_, _, _, _, dX, dy, _, dg = _create_data(
objective=task,
output='array',
chunk_size=10,
group=None
)
_, _, _, _, dX, dy, _, dg = _create_data(objective=task, output="array", chunk_size=10, group=None)
dask_model_factory = task_to_dask_factory[task]
# rebalance data to be sure that each worker has a piece of the data
client.rebalance()
n_workers = len(client.scheduler_info()['workers'])
n_workers = len(client.scheduler_info()["workers"])
assert n_workers > 1
workers_hostname = _get_workers_hostname(cluster)
open_ports = lgb.dask._find_n_open_ports(n_workers)
dask_model = dask_model_factory(
n_estimators=5,
num_leaves=5,
machines=",".join([
f"{workers_hostname}:{port}"
for port in open_ports
]),
machines=",".join([f"{workers_hostname}:{port}" for port in open_ports]),
)
# test that "machines" is actually respected by creating a socket that uses
......@@ -1626,12 +1364,7 @@ def test_machines_should_be_used_if_provided(task, cluster):
# an informative error should be raised if "machines" has duplicates
one_open_port = lgb.dask._find_n_open_ports(1)
dask_model.set_params(
machines=",".join([
f"127.0.0.1:{one_open_port}"
for _ in range(n_workers)
])
)
dask_model.set_params(machines=",".join([f"127.0.0.1:{one_open_port}" for _ in range(n_workers)]))
with pytest.raises(ValueError, match="Found duplicates in 'machines'"):
dask_model.fit(dX, dy, group=dg)
......@@ -1641,8 +1374,8 @@ def test_machines_should_be_used_if_provided(task, cluster):
[
(lgb.DaskLGBMClassifier, lgb.LGBMClassifier),
(lgb.DaskLGBMRegressor, lgb.LGBMRegressor),
(lgb.DaskLGBMRanker, lgb.LGBMRanker)
]
(lgb.DaskLGBMRanker, lgb.LGBMRanker),
],
)
def test_dask_classes_and_sklearn_equivalents_have_identical_constructors_except_client_arg(classes):
dask_spec = inspect.getfullargspec(classes[0])
......@@ -1655,7 +1388,7 @@ def test_dask_classes_and_sklearn_equivalents_have_identical_constructors_except
# "client" should be the only different, and the final argument
assert dask_spec.args[:-1] == sklearn_spec.args
assert dask_spec.defaults[:-1] == sklearn_spec.defaults
assert dask_spec.args[-1] == 'client'
assert dask_spec.args[-1] == "client"
assert dask_spec.defaults[-1] is None
......@@ -1668,18 +1401,18 @@ def test_dask_classes_and_sklearn_equivalents_have_identical_constructors_except
(lgb.DaskLGBMRegressor.fit, lgb.LGBMRegressor.fit),
(lgb.DaskLGBMRegressor.predict, lgb.LGBMRegressor.predict),
(lgb.DaskLGBMRanker.fit, lgb.LGBMRanker.fit),
(lgb.DaskLGBMRanker.predict, lgb.LGBMRanker.predict)
]
(lgb.DaskLGBMRanker.predict, lgb.LGBMRanker.predict),
],
)
def test_dask_methods_and_sklearn_equivalents_have_similar_signatures(methods):
dask_spec = inspect.getfullargspec(methods[0])
sklearn_spec = inspect.getfullargspec(methods[1])
dask_params = inspect.signature(methods[0]).parameters
sklearn_params = inspect.signature(methods[1]).parameters
assert dask_spec.args == sklearn_spec.args[:len(dask_spec.args)]
assert dask_spec.args == sklearn_spec.args[: len(dask_spec.args)]
assert dask_spec.varargs == sklearn_spec.varargs
if sklearn_spec.varkw:
assert dask_spec.varkw == sklearn_spec.varkw[:len(dask_spec.varkw)]
assert dask_spec.varkw == sklearn_spec.varkw[: len(dask_spec.varkw)]
assert dask_spec.kwonlyargs == sklearn_spec.kwonlyargs
assert dask_spec.kwonlydefaults == sklearn_spec.kwonlydefaults
for param in dask_spec.args:
......@@ -1687,14 +1420,10 @@ def test_dask_methods_and_sklearn_equivalents_have_similar_signatures(methods):
assert dask_params[param].default == sklearn_params[param].default, error_msg
@pytest.mark.parametrize('task', tasks)
@pytest.mark.parametrize("task", tasks)
def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task, cluster):
with Client(cluster):
_, _, _, _, dX, dy, dw, dg = _create_data(
objective=task,
output='dataframe',
group=None
)
_, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output="dataframe", group=None)
model_factory = task_to_dask_factory[task]
......@@ -1702,58 +1431,41 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task
dy_col_array = dy.reshape(-1, 1)
assert len(dy_col_array.shape) == 2 and dy_col_array.shape[1] == 1
params = {
'n_estimators': 1,
'num_leaves': 3,
'random_state': 0,
'time_out': 5
}
params = {"n_estimators": 1, "num_leaves": 3, "random_state": 0, "time_out": 5}
model = model_factory(**params)
model.fit(dX, dy_col_array, sample_weight=dw, group=dg)
assert model.fitted_
@pytest.mark.parametrize('task', tasks)
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize("task", tasks)
@pytest.mark.parametrize("output", data_output)
def test_init_score(task, output, cluster):
if task == 'ranking' and output == 'scipy_csr_matrix':
pytest.skip('LGBMRanker is not currently tested on sparse matrices')
if task == "ranking" and output == "scipy_csr_matrix":
pytest.skip("LGBMRanker is not currently tested on sparse matrices")
with Client(cluster) as client:
_, _, _, _, dX, dy, dw, dg = _create_data(
objective=task,
output=output,
group=None
)
_, _, _, _, dX, dy, dw, dg = _create_data(objective=task, output=output, group=None)
model_factory = task_to_dask_factory[task]
params = {
'n_estimators': 1,
'num_leaves': 2,
'time_out': 5
}
params = {"n_estimators": 1, "num_leaves": 2, "time_out": 5}
init_score = random.random()
size_factor = 1
if task == 'multiclass-classification':
if task == "multiclass-classification":
size_factor = 3 # number of classes
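# broadcast the same constant init_score to every row (one column per class for multiclass)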
if output.startswith('dataframe'):
if output.startswith("dataframe"):
init_scores = dy.map_partitions(lambda x: pd.DataFrame([[init_score] * size_factor] * x.size))
else:
init_scores = dy.map_blocks(lambda x: np.full((x.size, size_factor), init_score))
model = model_factory(client=client, **params)
model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg)
# value of the root node is 0 when init_score is set
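# (the constant baseline is carried by init_score, so it is not folded into the first tree)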
assert model.booster_.trees_to_dataframe()['value'][0] == 0
assert model.booster_.trees_to_dataframe()["value"][0] == 0
def sklearn_checks_to_run():
check_names = [
"check_estimator_get_tags_default_keys",
"check_get_params_invariance",
"check_set_params"
]
check_names = ["check_estimator_get_tags_default_keys", "check_get_params_invariance", "check_set_params"]
for check_name in check_names:
check_func = getattr(sklearn_checks, check_name, None)
if check_func:
......@@ -1782,79 +1494,58 @@ def test_parameters_default_constructible(estimator):
sklearn_checks.check_parameters_default_constructible(name, Estimator)
@pytest.mark.parametrize('task', tasks)
@pytest.mark.parametrize('output', data_output)
@pytest.mark.parametrize("task", tasks)
@pytest.mark.parametrize("output", data_output)
def test_predict_with_raw_score(task, output, cluster):
if task == 'ranking' and output == 'scipy_csr_matrix':
pytest.skip('LGBMRanker is not currently tested on sparse matrices')
if task == "ranking" and output == "scipy_csr_matrix":
pytest.skip("LGBMRanker is not currently tested on sparse matrices")
with Client(cluster) as client:
_, _, _, _, dX, dy, _, dg = _create_data(
objective=task,
output=output,
group=None
)
_, _, _, _, dX, dy, _, dg = _create_data(objective=task, output=output, group=None)
model_factory = task_to_dask_factory[task]
params = {
'client': client,
'n_estimators': 1,
'num_leaves': 2,
'time_out': 5,
'min_sum_hessian': 0
}
params = {"client": client, "n_estimators": 1, "num_leaves": 2, "time_out": 5, "min_sum_hessian": 0}
model = model_factory(**params)
model.fit(dX, dy, group=dg)
raw_predictions = model.predict(dX, raw_score=True).compute()
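# with a single boosting round and num_leaves=2, raw scores can only take the trained leaf output
# values (one two-leaf tree per class in the multiclass case)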
trees_df = model.booster_.trees_to_dataframe()
leaves_df = trees_df[trees_df.node_depth == 2]
if task == 'multiclass-classification':
if task == "multiclass-classification":
for i in range(model.n_classes_):
class_df = leaves_df[leaves_df.tree_index == i]
assert set(raw_predictions[:, i]) == set(class_df['value'])
assert set(raw_predictions[:, i]) == set(class_df["value"])
else:
assert set(raw_predictions) == set(leaves_df['value'])
assert set(raw_predictions) == set(leaves_df["value"])
if task.endswith('classification'):
if task.endswith("classification"):
pred_proba_raw = model.predict_proba(dX, raw_score=True).compute()
assert_eq(raw_predictions, pred_proba_raw)
def test_distributed_quantized_training(cluster):
with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data(
objective='regression',
output='array'
)
X, y, w, _, dX, dy, dw, _ = _create_data(objective="regression", output="array")
np.savetxt("data_dask.csv", np.hstack([np.array([y]).T, X]), fmt="%f,%f,%f,%f,%f")
params = {
"boosting_type": 'gbdt',
"boosting_type": "gbdt",
"n_estimators": 50,
"num_leaves": 31,
'use_quantized_grad': True,
'num_grad_quant_bins': 30,
'quant_train_renew_leaf': True,
'verbose': -1,
"use_quantized_grad": True,
"num_grad_quant_bins": 30,
"quant_train_renew_leaf": True,
"verbose": -1,
}
quant_dask_classifier = lgb.DaskLGBMRegressor(
client=client,
time_out=5,
**params
)
quant_dask_classifier = lgb.DaskLGBMRegressor(client=client, time_out=5, **params)
quant_dask_classifier = quant_dask_classifier.fit(dX, dy, sample_weight=dw)
quant_p1 = quant_dask_classifier.predict(dX)
quant_rmse = np.sqrt(np.mean((quant_p1.compute() - y) ** 2))
params["use_quantized_grad"] = False
dask_classifier = lgb.DaskLGBMRegressor(
client=client,
time_out=5,
**params
)
dask_classifier = lgb.DaskLGBMRegressor(client=client, time_out=5, **params)
dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
p1 = dask_classifier.predict(dX)
rmse = np.sqrt(np.mean((p1.compute() - y) ** 2))
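# quantized-gradient training is expected to reach an RMSE comparable to full-precision training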
......
......@@ -28,7 +28,7 @@ def test_cpu_and_gpu_work():
params_gpu = params_cpu.copy()
params_gpu["device"] = "gpu"
# Double-precision floats are only supported on x86_64 with PoCL
params_gpu["gpu_use_dp"] = (platform.machine() == "x86_64")
params_gpu["gpu_use_dp"] = platform.machine() == "x86_64"
gpu_bst = lgb.train(params_gpu, data, num_boost_round=10)
gpu_score = log_loss(y, gpu_bst.predict(X))
......
......@@ -9,7 +9,8 @@ from lightgbm.compat import GRAPHVIZ_INSTALLED, MATPLOTLIB_INSTALLED, PANDAS_INS
if MATPLOTLIB_INSTALLED:
import matplotlib
matplotlib.use('Agg')
matplotlib.use("Agg")
if GRAPHVIZ_INSTALLED:
import graphviz
......@@ -18,8 +19,7 @@ from .utils import load_breast_cancer, make_synthetic_regression
@pytest.fixture(scope="module")
def breast_cancer_split():
return train_test_split(*load_breast_cancer(return_X_y=True),
test_size=0.1, random_state=1)
return train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.1, random_state=1)
def _categorical_data(category_values_lower_bound, category_values_upper_bound):
......@@ -41,51 +41,51 @@ def train_data(breast_cancer_split):
@pytest.fixture
def params():
return {"objective": "binary",
"verbose": -1,
"num_leaves": 3}
return {"objective": "binary", "verbose": -1, "num_leaves": 3}
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason='matplotlib is not installed')
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason="matplotlib is not installed")
def test_plot_importance(params, breast_cancer_split, train_data):
X_train, _, y_train, _ = breast_cancer_split
gbm0 = lgb.train(params, train_data, num_boost_round=10)
ax0 = lgb.plot_importance(gbm0)
assert isinstance(ax0, matplotlib.axes.Axes)
assert ax0.get_title() == 'Feature importance'
assert ax0.get_xlabel() == 'Feature importance'
assert ax0.get_ylabel() == 'Features'
assert ax0.get_title() == "Feature importance"
assert ax0.get_xlabel() == "Feature importance"
assert ax0.get_ylabel() == "Features"
assert len(ax0.patches) <= 30
gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
gbm1.fit(X_train, y_train)
ax1 = lgb.plot_importance(gbm1, color='r', title='t', xlabel='x', ylabel='y')
ax1 = lgb.plot_importance(gbm1, color="r", title="t", xlabel="x", ylabel="y")
assert isinstance(ax1, matplotlib.axes.Axes)
assert ax1.get_title() == 't'
assert ax1.get_xlabel() == 'x'
assert ax1.get_ylabel() == 'y'
assert ax1.get_title() == "t"
assert ax1.get_xlabel() == "x"
assert ax1.get_ylabel() == "y"
assert len(ax1.patches) <= 30
for patch in ax1.patches:
assert patch.get_facecolor() == (1., 0, 0, 1.) # red
assert patch.get_facecolor() == (1.0, 0, 0, 1.0) # red
ax2 = lgb.plot_importance(gbm0, color=['r', 'y', 'g', 'b'], title=None, xlabel=None, ylabel=None)
ax2 = lgb.plot_importance(gbm0, color=["r", "y", "g", "b"], title=None, xlabel=None, ylabel=None)
assert isinstance(ax2, matplotlib.axes.Axes)
assert ax2.get_title() == ''
assert ax2.get_xlabel() == ''
assert ax2.get_ylabel() == ''
assert ax2.get_title() == ""
assert ax2.get_xlabel() == ""
assert ax2.get_ylabel() == ""
assert len(ax2.patches) <= 30
assert ax2.patches[0].get_facecolor() == (1., 0, 0, 1.) # r
assert ax2.patches[1].get_facecolor() == (.75, .75, 0, 1.) # y
assert ax2.patches[2].get_facecolor() == (0, .5, 0, 1.) # g
assert ax2.patches[3].get_facecolor() == (0, 0, 1., 1.) # b
assert ax2.patches[0].get_facecolor() == (1.0, 0, 0, 1.0) # r
assert ax2.patches[1].get_facecolor() == (0.75, 0.75, 0, 1.0) # y
assert ax2.patches[2].get_facecolor() == (0, 0.5, 0, 1.0) # g
assert ax2.patches[3].get_facecolor() == (0, 0, 1.0, 1.0) # b
ax3 = lgb.plot_importance(gbm0, title='t @importance_type@', xlabel='x @importance_type@', ylabel='y @importance_type@')
ax3 = lgb.plot_importance(
gbm0, title="t @importance_type@", xlabel="x @importance_type@", ylabel="y @importance_type@"
)
assert isinstance(ax3, matplotlib.axes.Axes)
assert ax3.get_title() == 't @importance_type@'
assert ax3.get_xlabel() == 'x split'
assert ax3.get_ylabel() == 'y @importance_type@'
assert ax3.get_title() == "t @importance_type@"
assert ax3.get_xlabel() == "x split"
assert ax3.get_ylabel() == "y @importance_type@"
assert len(ax3.patches) <= 30
gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1, importance_type="gain")
......@@ -108,51 +108,59 @@ def test_plot_importance(params, breast_cancer_split, train_data):
assert first_bar1 != first_bar3
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason='matplotlib is not installed')
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason="matplotlib is not installed")
def test_plot_split_value_histogram(params, breast_cancer_split, train_data):
X_train, _, y_train, _ = breast_cancer_split
gbm0 = lgb.train(params, train_data, num_boost_round=10)
ax0 = lgb.plot_split_value_histogram(gbm0, 27)
assert isinstance(ax0, matplotlib.axes.Axes)
assert ax0.get_title() == 'Split value histogram for feature with index 27'
assert ax0.get_xlabel() == 'Feature split value'
assert ax0.get_ylabel() == 'Count'
assert ax0.get_title() == "Split value histogram for feature with index 27"
assert ax0.get_xlabel() == "Feature split value"
assert ax0.get_ylabel() == "Count"
assert len(ax0.patches) <= 2
gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
gbm1.fit(X_train, y_train)
ax1 = lgb.plot_split_value_histogram(gbm1, gbm1.booster_.feature_name()[27], figsize=(10, 5),
title='Histogram for feature @index/name@ @feature@',
xlabel='x', ylabel='y', color='r')
ax1 = lgb.plot_split_value_histogram(
gbm1,
gbm1.booster_.feature_name()[27],
figsize=(10, 5),
title="Histogram for feature @index/name@ @feature@",
xlabel="x",
ylabel="y",
color="r",
)
assert isinstance(ax1, matplotlib.axes.Axes)
title = f'Histogram for feature name {gbm1.booster_.feature_name()[27]}'
title = f"Histogram for feature name {gbm1.booster_.feature_name()[27]}"
assert ax1.get_title() == title
assert ax1.get_xlabel() == 'x'
assert ax1.get_ylabel() == 'y'
assert ax1.get_xlabel() == "x"
assert ax1.get_ylabel() == "y"
assert len(ax1.patches) <= 2
for patch in ax1.patches:
assert patch.get_facecolor() == (1., 0, 0, 1.) # red
assert patch.get_facecolor() == (1.0, 0, 0, 1.0) # red
ax2 = lgb.plot_split_value_histogram(gbm0, 27, bins=10, color=['r', 'y', 'g', 'b'],
title=None, xlabel=None, ylabel=None)
ax2 = lgb.plot_split_value_histogram(
gbm0, 27, bins=10, color=["r", "y", "g", "b"], title=None, xlabel=None, ylabel=None
)
assert isinstance(ax2, matplotlib.axes.Axes)
assert ax2.get_title() == ''
assert ax2.get_xlabel() == ''
assert ax2.get_ylabel() == ''
assert ax2.get_title() == ""
assert ax2.get_xlabel() == ""
assert ax2.get_ylabel() == ""
assert len(ax2.patches) == 10
assert ax2.patches[0].get_facecolor() == (1., 0, 0, 1.) # r
assert ax2.patches[1].get_facecolor() == (.75, .75, 0, 1.) # y
assert ax2.patches[2].get_facecolor() == (0, .5, 0, 1.) # g
assert ax2.patches[3].get_facecolor() == (0, 0, 1., 1.) # b
assert ax2.patches[0].get_facecolor() == (1.0, 0, 0, 1.0) # r
assert ax2.patches[1].get_facecolor() == (0.75, 0.75, 0, 1.0) # y
assert ax2.patches[2].get_facecolor() == (0, 0.5, 0, 1.0) # g
assert ax2.patches[3].get_facecolor() == (0, 0, 1.0, 1.0) # b
with pytest.raises(ValueError):
lgb.plot_split_value_histogram(gbm0, 0) # was not used in splitting
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED or not GRAPHVIZ_INSTALLED,
reason='matplotlib or graphviz is not installed')
@pytest.mark.skipif(
not MATPLOTLIB_INSTALLED or not GRAPHVIZ_INSTALLED, reason="matplotlib or graphviz is not installed"
)
def test_plot_tree(breast_cancer_split):
X_train, _, y_train, _ = breast_cancer_split
gbm = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, verbose=-1)
......@@ -161,14 +169,14 @@ def test_plot_tree(breast_cancer_split):
with pytest.raises(IndexError):
lgb.plot_tree(gbm, tree_index=83)
ax = lgb.plot_tree(gbm, tree_index=3, figsize=(15, 8), show_info=['split_gain'])
ax = lgb.plot_tree(gbm, tree_index=3, figsize=(15, 8), show_info=["split_gain"])
assert isinstance(ax, matplotlib.axes.Axes)
w, h = ax.axes.get_figure().get_size_inches()
assert int(w) == 15
assert int(h) == 8
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed')
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_create_tree_digraph(breast_cancer_split):
X_train, _, y_train, _ = breast_cancer_split
......@@ -179,28 +187,32 @@ def test_create_tree_digraph(breast_cancer_split):
with pytest.raises(IndexError):
lgb.create_tree_digraph(gbm, tree_index=83)
graph = lgb.create_tree_digraph(gbm, tree_index=3,
show_info=['split_gain', 'internal_value', 'internal_weight'],
name='Tree4', node_attr={'color': 'red'})
graph = lgb.create_tree_digraph(
gbm,
tree_index=3,
show_info=["split_gain", "internal_value", "internal_weight"],
name="Tree4",
node_attr={"color": "red"},
)
graph.render(view=False)
assert isinstance(graph, graphviz.Digraph)
assert graph.name == 'Tree4'
assert graph.name == "Tree4"
assert len(graph.node_attr) == 1
assert graph.node_attr['color'] == 'red'
assert graph.node_attr["color"] == "red"
assert len(graph.graph_attr) == 0
assert len(graph.edge_attr) == 0
graph_body = ''.join(graph.body)
assert 'leaf' in graph_body
assert 'gain' in graph_body
assert 'value' in graph_body
assert 'weight' in graph_body
assert '#ffdddd' in graph_body
assert '#ddffdd' in graph_body
assert 'data' not in graph_body
assert 'count' not in graph_body
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed')
graph_body = "".join(graph.body)
assert "leaf" in graph_body
assert "gain" in graph_body
assert "value" in graph_body
assert "weight" in graph_body
assert "#ffdddd" in graph_body
assert "#ddffdd" in graph_body
assert "data" not in graph_body
assert "count" not in graph_body
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_tree_with_categories_below_max_category_values():
X_train, y_train = _categorical_data(2, 10)
params = {
......@@ -211,7 +223,7 @@ def test_tree_with_categories_below_max_category_values():
"deterministic": True,
"num_threads": 1,
"seed": 708,
"verbose": -1
"verbose": -1,
}
gbm = lgb.LGBMClassifier(**params)
gbm.fit(X_train, y_train)
......@@ -219,28 +231,32 @@ def test_tree_with_categories_below_max_category_values():
with pytest.raises(IndexError):
lgb.create_tree_digraph(gbm, tree_index=83)
graph = lgb.create_tree_digraph(gbm, tree_index=3,
show_info=['split_gain', 'internal_value', 'internal_weight'],
name='Tree4', node_attr={'color': 'red'},
max_category_values=10)
graph = lgb.create_tree_digraph(
gbm,
tree_index=3,
show_info=["split_gain", "internal_value", "internal_weight"],
name="Tree4",
node_attr={"color": "red"},
max_category_values=10,
)
graph.render(view=False)
assert isinstance(graph, graphviz.Digraph)
assert graph.name == 'Tree4'
assert graph.name == "Tree4"
assert len(graph.node_attr) == 1
assert graph.node_attr['color'] == 'red'
assert graph.node_attr["color"] == "red"
assert len(graph.graph_attr) == 0
assert len(graph.edge_attr) == 0
graph_body = ''.join(graph.body)
assert 'leaf' in graph_body
assert 'gain' in graph_body
assert 'value' in graph_body
assert 'weight' in graph_body
assert 'data' not in graph_body
assert 'count' not in graph_body
assert '||...||' not in graph_body
graph_body = "".join(graph.body)
assert "leaf" in graph_body
assert "gain" in graph_body
assert "value" in graph_body
assert "weight" in graph_body
assert "data" not in graph_body
assert "count" not in graph_body
assert "||...||" not in graph_body
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed')
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_tree_with_categories_above_max_category_values():
X_train, y_train = _categorical_data(20, 30)
params = {
......@@ -251,7 +267,7 @@ def test_tree_with_categories_above_max_category_values():
"deterministic": True,
"num_threads": 1,
"seed": 708,
"verbose": -1
"verbose": -1,
}
gbm = lgb.LGBMClassifier(**params)
gbm.fit(X_train, y_train)
......@@ -259,32 +275,36 @@ def test_tree_with_categories_above_max_category_values():
with pytest.raises(IndexError):
lgb.create_tree_digraph(gbm, tree_index=83)
graph = lgb.create_tree_digraph(gbm, tree_index=9,
show_info=['split_gain', 'internal_value', 'internal_weight'],
name='Tree4', node_attr={'color': 'red'},
max_category_values=4)
graph = lgb.create_tree_digraph(
gbm,
tree_index=9,
show_info=["split_gain", "internal_value", "internal_weight"],
name="Tree4",
node_attr={"color": "red"},
max_category_values=4,
)
graph.render(view=False)
assert isinstance(graph, graphviz.Digraph)
assert graph.name == 'Tree4'
assert graph.name == "Tree4"
assert len(graph.node_attr) == 1
assert graph.node_attr['color'] == 'red'
assert graph.node_attr["color"] == "red"
assert len(graph.graph_attr) == 0
assert len(graph.edge_attr) == 0
graph_body = ''.join(graph.body)
assert 'leaf' in graph_body
assert 'gain' in graph_body
assert 'value' in graph_body
assert 'weight' in graph_body
assert 'data' not in graph_body
assert 'count' not in graph_body
assert '||...||' in graph_body
@pytest.mark.parametrize('use_missing', [True, False])
@pytest.mark.parametrize('zero_as_missing', [True, False])
graph_body = "".join(graph.body)
assert "leaf" in graph_body
assert "gain" in graph_body
assert "value" in graph_body
assert "weight" in graph_body
assert "data" not in graph_body
assert "count" not in graph_body
assert "||...||" in graph_body
@pytest.mark.parametrize("use_missing", [True, False])
@pytest.mark.parametrize("zero_as_missing", [True, False])
def test_numeric_split_direction(use_missing, zero_as_missing):
if use_missing and zero_as_missing:
pytest.skip('use_missing and zero_as_missing both set to True')
pytest.skip("use_missing and zero_as_missing both set to True")
X, y = make_synthetic_regression()
rng = np.random.RandomState(0)
zero_mask = rng.rand(X.shape[0]) < 0.05
......@@ -294,48 +314,48 @@ def test_numeric_split_direction(use_missing, zero_as_missing):
X[nan_mask, :] = np.nan
ds = lgb.Dataset(X, y)
params = {
'num_leaves': 127,
'min_child_samples': 1,
'use_missing': use_missing,
'zero_as_missing': zero_as_missing,
"num_leaves": 127,
"min_child_samples": 1,
"use_missing": use_missing,
"zero_as_missing": zero_as_missing,
}
bst = lgb.train(params, ds, num_boost_round=1)
case_with_zero = X[zero_mask][[0]]
expected_leaf_zero = bst.predict(case_with_zero, pred_leaf=True)[0]
node = bst.dump_model()['tree_info'][0]['tree_structure']
while 'decision_type' in node:
node = bst.dump_model()["tree_info"][0]["tree_structure"]
while "decision_type" in node:
direction = lgb.plotting._determine_direction_for_numeric_split(
case_with_zero[0][node['split_feature']], node['threshold'], node['missing_type'], node['default_left']
case_with_zero[0][node["split_feature"]], node["threshold"], node["missing_type"], node["default_left"]
)
node = node['left_child'] if direction == 'left' else node['right_child']
assert node['leaf_index'] == expected_leaf_zero
node = node["left_child"] if direction == "left" else node["right_child"]
assert node["leaf_index"] == expected_leaf_zero
if use_missing:
case_with_nan = X[nan_mask][[0]]
expected_leaf_nan = bst.predict(case_with_nan, pred_leaf=True)[0]
node = bst.dump_model()['tree_info'][0]['tree_structure']
while 'decision_type' in node:
node = bst.dump_model()["tree_info"][0]["tree_structure"]
while "decision_type" in node:
direction = lgb.plotting._determine_direction_for_numeric_split(
case_with_nan[0][node['split_feature']], node['threshold'], node['missing_type'], node['default_left']
case_with_nan[0][node["split_feature"]], node["threshold"], node["missing_type"], node["default_left"]
)
node = node['left_child'] if direction == 'left' else node['right_child']
assert node['leaf_index'] == expected_leaf_nan
node = node["left_child"] if direction == "left" else node["right_child"]
assert node["leaf_index"] == expected_leaf_nan
assert expected_leaf_zero != expected_leaf_nan
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed')
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
def test_example_case_in_tree_digraph():
rng = np.random.RandomState(0)
x1 = rng.rand(100)
cat = rng.randint(1, 3, size=x1.size)
X = np.vstack([x1, cat]).T
y = x1 + 2 * cat
feature_name = ['x1', 'cat']
ds = lgb.Dataset(X, y, feature_name=feature_name, categorical_feature=['cat'])
feature_name = ["x1", "cat"]
ds = lgb.Dataset(X, y, feature_name=feature_name, categorical_feature=["cat"])
num_round = 3
bst = lgb.train({'num_leaves': 7}, ds, num_boost_round=num_round)
bst = lgb.train({"num_leaves": 7}, ds, num_boost_round=num_round)
mod = bst.dump_model()
example_case = X[[0]]
makes_categorical_splits = False
......@@ -343,42 +363,46 @@ def test_example_case_in_tree_digraph():
for i in range(num_round):
graph = lgb.create_tree_digraph(bst, example_case=example_case, tree_index=i)
gbody = graph.body
node = mod['tree_info'][i]['tree_structure']
while 'decision_type' in node: # iterate through the splits
split_index = node['split_index']
node = mod["tree_info"][i]["tree_structure"]
while "decision_type" in node: # iterate through the splits
split_index = node["split_index"]
node_in_graph = [n for n in gbody if f'split{split_index}' in n and '->' not in n]
node_in_graph = [n for n in gbody if f"split{split_index}" in n and "->" not in n]
assert len(node_in_graph) == 1
seen_indices.add(gbody.index(node_in_graph[0]))
edge_to_node = [e for e in gbody if f'-> split{split_index}' in e]
if node['decision_type'] == '<=':
edge_to_node = [e for e in gbody if f"-> split{split_index}" in e]
if node["decision_type"] == "<=":
direction = lgb.plotting._determine_direction_for_numeric_split(
example_case[0][node['split_feature']], node['threshold'], node['missing_type'], node['default_left'])
example_case[0][node["split_feature"]],
node["threshold"],
node["missing_type"],
node["default_left"],
)
else:
makes_categorical_splits = True
direction = lgb.plotting._determine_direction_for_categorical_split(
example_case[0][node['split_feature']], node['threshold']
example_case[0][node["split_feature"]], node["threshold"]
)
node = node['left_child'] if direction == 'left' else node['right_child']
assert 'color=blue' in node_in_graph[0]
node = node["left_child"] if direction == "left" else node["right_child"]
assert "color=blue" in node_in_graph[0]
if edge_to_node:
assert len(edge_to_node) == 1
assert 'color=blue' in edge_to_node[0]
assert "color=blue" in edge_to_node[0]
seen_indices.add(gbody.index(edge_to_node[0]))
# we're in a leaf now
leaf_index = node['leaf_index']
leaf_in_graph = [n for n in gbody if f'leaf{leaf_index}' in n and '->' not in n]
edge_to_leaf = [e for e in gbody if f'-> leaf{leaf_index}' in e]
leaf_index = node["leaf_index"]
leaf_in_graph = [n for n in gbody if f"leaf{leaf_index}" in n and "->" not in n]
edge_to_leaf = [e for e in gbody if f"-> leaf{leaf_index}" in e]
assert len(leaf_in_graph) == 1
assert 'color=blue' in leaf_in_graph[0]
assert "color=blue" in leaf_in_graph[0]
assert len(edge_to_leaf) == 1
assert 'color=blue' in edge_to_leaf[0]
assert "color=blue" in edge_to_leaf[0]
seen_indices.update([gbody.index(leaf_in_graph[0]), gbody.index(edge_to_leaf[0])])
# check that the rest of the elements have black color
remaining_elements = [e for i, e in enumerate(graph.body) if i not in seen_indices and 'graph' not in e]
assert all('color=black' in e for e in remaining_elements)
remaining_elements = [e for i, e in enumerate(graph.body) if i not in seen_indices and "graph" not in e]
assert all("color=black" in e for e in remaining_elements)
# check that we got to the expected leaf
expected_leaf = bst.predict(example_case, start_iteration=i, num_iteration=1, pred_leaf=True)[0]
......@@ -386,83 +410,86 @@ def test_example_case_in_tree_digraph():
assert makes_categorical_splits
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason='graphviz is not installed')
@pytest.mark.parametrize('input_type', ['array', 'dataframe'])
@pytest.mark.skipif(not GRAPHVIZ_INSTALLED, reason="graphviz is not installed")
@pytest.mark.parametrize("input_type", ["array", "dataframe"])
def test_empty_example_case_on_tree_digraph_raises_error(input_type):
X, y = make_synthetic_regression()
if input_type == 'dataframe':
if input_type == "dataframe":
if not PANDAS_INSTALLED:
pytest.skip(reason='pandas is not installed')
pytest.skip(reason="pandas is not installed")
X = pd_DataFrame(X)
ds = lgb.Dataset(X, y)
bst = lgb.train({'num_leaves': 3}, ds, num_boost_round=1)
bst = lgb.train({"num_leaves": 3}, ds, num_boost_round=1)
example_case = X[:0]
if input_type == 'dataframe':
if input_type == "dataframe":
example_case = pd_DataFrame(example_case)
with pytest.raises(ValueError, match='example_case must have a single row.'):
with pytest.raises(ValueError, match="example_case must have a single row."):
lgb.create_tree_digraph(bst, tree_index=0, example_case=example_case)
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason='matplotlib is not installed')
@pytest.mark.skipif(not MATPLOTLIB_INSTALLED, reason="matplotlib is not installed")
def test_plot_metrics(params, breast_cancer_split, train_data):
X_train, X_test, y_train, y_test = breast_cancer_split
test_data = lgb.Dataset(X_test, y_test, reference=train_data)
params.update({"metric": {"binary_logloss", "binary_error"}})
evals_result0 = {}
lgb.train(params, train_data,
valid_sets=[train_data, test_data],
valid_names=['v1', 'v2'],
num_boost_round=10,
callbacks=[lgb.record_evaluation(evals_result0)])
lgb.train(
params,
train_data,
valid_sets=[train_data, test_data],
valid_names=["v1", "v2"],
num_boost_round=10,
callbacks=[lgb.record_evaluation(evals_result0)],
)
with pytest.warns(UserWarning, match="More than one metric available, picking one to plot."):
ax0 = lgb.plot_metric(evals_result0)
assert isinstance(ax0, matplotlib.axes.Axes)
assert ax0.get_title() == 'Metric during training'
assert ax0.get_xlabel() == 'Iterations'
assert ax0.get_ylabel() in {'binary_logloss', 'binary_error'}
assert ax0.get_title() == "Metric during training"
assert ax0.get_xlabel() == "Iterations"
assert ax0.get_ylabel() in {"binary_logloss", "binary_error"}
legend_items = ax0.get_legend().get_texts()
assert len(legend_items) == 2
assert legend_items[0].get_text() == 'v1'
assert legend_items[1].get_text() == 'v2'
assert legend_items[0].get_text() == "v1"
assert legend_items[1].get_text() == "v2"
ax1 = lgb.plot_metric(evals_result0, metric='binary_error')
ax1 = lgb.plot_metric(evals_result0, metric="binary_error")
assert isinstance(ax1, matplotlib.axes.Axes)
assert ax1.get_title() == 'Metric during training'
assert ax1.get_xlabel() == 'Iterations'
assert ax1.get_ylabel() == 'binary_error'
assert ax1.get_title() == "Metric during training"
assert ax1.get_xlabel() == "Iterations"
assert ax1.get_ylabel() == "binary_error"
legend_items = ax1.get_legend().get_texts()
assert len(legend_items) == 2
assert legend_items[0].get_text() == 'v1'
assert legend_items[1].get_text() == 'v2'
assert legend_items[0].get_text() == "v1"
assert legend_items[1].get_text() == "v2"
ax2 = lgb.plot_metric(evals_result0, metric='binary_logloss', dataset_names=['v2'])
ax2 = lgb.plot_metric(evals_result0, metric="binary_logloss", dataset_names=["v2"])
assert isinstance(ax2, matplotlib.axes.Axes)
assert ax2.get_title() == 'Metric during training'
assert ax2.get_xlabel() == 'Iterations'
assert ax2.get_ylabel() == 'binary_logloss'
assert ax2.get_title() == "Metric during training"
assert ax2.get_xlabel() == "Iterations"
assert ax2.get_ylabel() == "binary_logloss"
legend_items = ax2.get_legend().get_texts()
assert len(legend_items) == 1
assert legend_items[0].get_text() == 'v2'
assert legend_items[0].get_text() == "v2"
ax3 = lgb.plot_metric(
evals_result0,
metric='binary_logloss',
dataset_names=['v1'],
title='Metric @metric@',
xlabel='Iterations @metric@',
metric="binary_logloss",
dataset_names=["v1"],
title="Metric @metric@",
xlabel="Iterations @metric@",
ylabel='Value of "@metric@"',
figsize=(5, 5),
dpi=600,
grid=False
grid=False,
)
assert isinstance(ax3, matplotlib.axes.Axes)
assert ax3.get_title() == 'Metric @metric@'
assert ax3.get_xlabel() == 'Iterations @metric@'
assert ax3.get_title() == "Metric @metric@"
assert ax3.get_xlabel() == "Iterations @metric@"
assert ax3.get_ylabel() == 'Value of "binary_logloss"'
legend_items = ax3.get_legend().get_texts()
assert len(legend_items) == 1
assert legend_items[0].get_text() == 'v1'
assert legend_items[0].get_text() == "v1"
assert ax3.get_figure().get_figheight() == 5
assert ax3.get_figure().get_figwidth() == 5
assert ax3.get_figure().get_dpi() == 600
......@@ -472,9 +499,7 @@ def test_plot_metrics(params, breast_cancer_split, train_data):
assert not grid_line.get_visible()
evals_result1 = {}
lgb.train(params, train_data,
num_boost_round=10,
callbacks=[lgb.record_evaluation(evals_result1)])
lgb.train(params, train_data, num_boost_round=10, callbacks=[lgb.record_evaluation(evals_result1)])
with pytest.raises(ValueError, match="eval results cannot be empty."):
lgb.plot_metric(evals_result1)
......@@ -482,9 +507,9 @@ def test_plot_metrics(params, breast_cancer_split, train_data):
gbm2.fit(X_train, y_train, eval_set=[(X_test, y_test)])
ax4 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
assert isinstance(ax4, matplotlib.axes.Axes)
assert ax4.get_title() == ''
assert ax4.get_xlabel() == ''
assert ax4.get_ylabel() == ''
assert ax4.get_title() == ""
assert ax4.get_xlabel() == ""
assert ax4.get_ylabel() == ""
legend_items = ax4.get_legend().get_texts()
assert len(legend_items) == 1
assert legend_items[0].get_text() == 'valid_0'
assert legend_items[0].get_text() == "valid_0"
......@@ -23,32 +23,40 @@ from sklearn.utils.validation import check_is_fitted
import lightgbm as lgb
from lightgbm.compat import DATATABLE_INSTALLED, PANDAS_INSTALLED, dt_DataTable, pd_DataFrame, pd_Series
from .utils import (load_breast_cancer, load_digits, load_iris, load_linnerud, make_ranking, make_synthetic_regression,
sklearn_multiclass_custom_objective, softmax)
from .utils import (
load_breast_cancer,
load_digits,
load_iris,
load_linnerud,
make_ranking,
make_synthetic_regression,
sklearn_multiclass_custom_objective,
softmax,
)
decreasing_generator = itertools.count(0, -1)
task_to_model_factory = {
'ranking': lgb.LGBMRanker,
'binary-classification': lgb.LGBMClassifier,
'multiclass-classification': lgb.LGBMClassifier,
'regression': lgb.LGBMRegressor,
"ranking": lgb.LGBMRanker,
"binary-classification": lgb.LGBMClassifier,
"multiclass-classification": lgb.LGBMClassifier,
"regression": lgb.LGBMRegressor,
}
def _create_data(task, n_samples=100, n_features=4):
if task == 'ranking':
if task == "ranking":
X, y, g = make_ranking(n_features=4, n_samples=n_samples)
g = np.bincount(g)
elif task.endswith('classification'):
if task == 'binary-classification':
elif task.endswith("classification"):
if task == "binary-classification":
centers = 2
elif task == 'multiclass-classification':
elif task == "multiclass-classification":
centers = 3
else:
ValueError(f"Unknown classification task '{task}'")
X, y = make_blobs(n_samples=n_samples, n_features=n_features, centers=centers, random_state=42)
g = None
elif task == 'regression':
elif task == "regression":
X, y = make_synthetic_regression(n_samples=n_samples, n_features=n_features)
g = None
return X, y, g
......@@ -70,7 +78,7 @@ def custom_asymmetric_obj(y_true, y_pred):
def objective_ls(y_true, y_pred):
grad = (y_pred - y_true)
grad = y_pred - y_true
hess = np.ones(len(y_true))
return grad, hess
......@@ -87,15 +95,15 @@ def custom_dummy_obj(y_true, y_pred):
def constant_metric(y_true, y_pred):
return 'error', 0, False
return "error", 0, False
def decreasing_metric(y_true, y_pred):
return ('decreasing_metric', next(decreasing_generator), False)
return ("decreasing_metric", next(decreasing_generator), False)
def mse(y_true, y_pred):
return 'custom MSE', mean_squared_error(y_true, y_pred), False
return "custom MSE", mean_squared_error(y_true, y_pred), False
def binary_error(y_true, y_pred):
......@@ -117,7 +125,7 @@ def test_binary():
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[lgb.early_stopping(5)])
ret = log_loss(y_test, gbm.predict_proba(X_test))
assert ret < 0.12
assert gbm.evals_result_['valid_0']['binary_logloss'][gbm.best_iteration_ - 1] == pytest.approx(ret)
assert gbm.evals_result_["valid_0"]["binary_logloss"][gbm.best_iteration_ - 1] == pytest.approx(ret)
def test_regression():
......@@ -127,10 +135,12 @@ def test_regression():
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[lgb.early_stopping(5)])
ret = mean_squared_error(y_test, gbm.predict(X_test))
assert ret < 174
assert gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1] == pytest.approx(ret)
assert gbm.evals_result_["valid_0"]["l2"][gbm.best_iteration_ - 1] == pytest.approx(ret)
@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version')
@pytest.mark.skipif(
getenv("TASK", "") == "cuda", reason="Skip due to differences in implementation details of CUDA version"
)
def test_multiclass():
X, y = load_digits(n_class=10, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
......@@ -140,16 +150,18 @@ def test_multiclass():
assert ret < 0.05
ret = multi_logloss(y_test, gbm.predict_proba(X_test))
assert ret < 0.16
assert gbm.evals_result_['valid_0']['multi_logloss'][gbm.best_iteration_ - 1] == pytest.approx(ret)
assert gbm.evals_result_["valid_0"]["multi_logloss"][gbm.best_iteration_ - 1] == pytest.approx(ret)
@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version')
@pytest.mark.skipif(
getenv("TASK", "") == "cuda", reason="Skip due to differences in implementation details of CUDA version"
)
def test_lambdarank():
rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank'
X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train'))
X_test, y_test = load_svmlight_file(str(rank_example_dir / 'rank.test'))
q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query'))
q_test = np.loadtxt(str(rank_example_dir / 'rank.test.query'))
rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank"
X_train, y_train = load_svmlight_file(str(rank_example_dir / "rank.train"))
X_test, y_test = load_svmlight_file(str(rank_example_dir / "rank.test"))
q_train = np.loadtxt(str(rank_example_dir / "rank.train.query"))
q_test = np.loadtxt(str(rank_example_dir / "rank.test.query"))
gbm = lgb.LGBMRanker(n_estimators=50)
gbm.fit(
X_train,
......@@ -158,23 +170,20 @@ def test_lambdarank():
eval_set=[(X_test, y_test)],
eval_group=[q_test],
eval_at=[1, 3],
callbacks=[
lgb.early_stopping(10),
lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))
]
callbacks=[lgb.early_stopping(10), lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))],
)
assert gbm.best_iteration_ <= 24
assert gbm.best_score_['valid_0']['ndcg@1'] > 0.5674
assert gbm.best_score_['valid_0']['ndcg@3'] > 0.578
assert gbm.best_score_["valid_0"]["ndcg@1"] > 0.5674
assert gbm.best_score_["valid_0"]["ndcg@3"] > 0.578
def test_xendcg():
xendcg_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'xendcg'
X_train, y_train = load_svmlight_file(str(xendcg_example_dir / 'rank.train'))
X_test, y_test = load_svmlight_file(str(xendcg_example_dir / 'rank.test'))
q_train = np.loadtxt(str(xendcg_example_dir / 'rank.train.query'))
q_test = np.loadtxt(str(xendcg_example_dir / 'rank.test.query'))
gbm = lgb.LGBMRanker(n_estimators=50, objective='rank_xendcg', random_state=5, n_jobs=1)
xendcg_example_dir = Path(__file__).absolute().parents[2] / "examples" / "xendcg"
X_train, y_train = load_svmlight_file(str(xendcg_example_dir / "rank.train"))
X_test, y_test = load_svmlight_file(str(xendcg_example_dir / "rank.test"))
q_train = np.loadtxt(str(xendcg_example_dir / "rank.train.query"))
q_test = np.loadtxt(str(xendcg_example_dir / "rank.test.query"))
gbm = lgb.LGBMRanker(n_estimators=50, objective="rank_xendcg", random_state=5, n_jobs=1)
gbm.fit(
X_train,
y_train,
......@@ -182,28 +191,25 @@ def test_xendcg():
eval_set=[(X_test, y_test)],
eval_group=[q_test],
eval_at=[1, 3],
eval_metric='ndcg',
callbacks=[
lgb.early_stopping(10),
lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))
]
eval_metric="ndcg",
callbacks=[lgb.early_stopping(10), lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))],
)
assert gbm.best_iteration_ <= 24
assert gbm.best_score_['valid_0']['ndcg@1'] > 0.6211
assert gbm.best_score_['valid_0']['ndcg@3'] > 0.6253
assert gbm.best_score_["valid_0"]["ndcg@1"] > 0.6211
assert gbm.best_score_["valid_0"]["ndcg@3"] > 0.6253
def test_eval_at_aliases():
rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank'
X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train'))
X_test, y_test = load_svmlight_file(str(rank_example_dir / 'rank.test'))
q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query'))
q_test = np.loadtxt(str(rank_example_dir / 'rank.test.query'))
for alias in lgb.basic._ConfigAliases.get('eval_at'):
rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank"
X_train, y_train = load_svmlight_file(str(rank_example_dir / "rank.train"))
X_test, y_test = load_svmlight_file(str(rank_example_dir / "rank.test"))
q_train = np.loadtxt(str(rank_example_dir / "rank.train.query"))
q_test = np.loadtxt(str(rank_example_dir / "rank.test.query"))
for alias in lgb.basic._ConfigAliases.get("eval_at"):
gbm = lgb.LGBMRanker(n_estimators=5, **{alias: [1, 2, 3, 9]})
with pytest.warns(UserWarning, match=f"Found '{alias}' in params. Will use it instead of 'eval_at' argument"):
gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)], eval_group=[q_test])
assert list(gbm.evals_result_['valid_0'].keys()) == ['ndcg@1', 'ndcg@2', 'ndcg@3', 'ndcg@9']
assert list(gbm.evals_result_["valid_0"].keys()) == ["ndcg@1", "ndcg@2", "ndcg@3", "ndcg@9"]
@pytest.mark.parametrize("custom_objective", [True, False])
......@@ -212,20 +218,22 @@ def test_objective_aliases(custom_objective):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
if custom_objective:
obj = custom_dummy_obj
metric_name = 'l2' # default one
metric_name = "l2" # default one
else:
obj = 'mape'
metric_name = 'mape'
obj = "mape"
metric_name = "mape"
evals = []
for alias in lgb.basic._ConfigAliases.get('objective'):
for alias in lgb.basic._ConfigAliases.get("objective"):
gbm = lgb.LGBMRegressor(n_estimators=5, **{alias: obj})
if alias != 'objective':
with pytest.warns(UserWarning, match=f"Found '{alias}' in params. Will use it instead of 'objective' argument"):
if alias != "objective":
with pytest.warns(
UserWarning, match=f"Found '{alias}' in params. Will use it instead of 'objective' argument"
):
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)])
else:
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)])
assert list(gbm.evals_result_['valid_0'].keys()) == [metric_name]
evals.append(gbm.evals_result_['valid_0'][metric_name])
assert list(gbm.evals_result_["valid_0"].keys()) == [metric_name]
evals.append(gbm.evals_result_["valid_0"][metric_name])
evals_t = np.array(evals).T
for i in range(evals_t.shape[0]):
np.testing.assert_allclose(evals_t[i], evals_t[i][0])
......@@ -241,7 +249,7 @@ def test_regression_with_custom_objective():
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[lgb.early_stopping(5)])
ret = mean_squared_error(y_test, gbm.predict(X_test))
assert ret < 174
assert gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1] == pytest.approx(ret)
assert gbm.evals_result_["valid_0"]["l2"][gbm.best_iteration_ - 1] == pytest.approx(ret)
def test_binary_classification_with_custom_objective():
......@@ -260,7 +268,7 @@ def test_binary_classification_with_custom_objective():
def test_dart():
X, y = make_synthetic_regression()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMRegressor(boosting_type='dart', n_estimators=50)
gbm = lgb.LGBMRegressor(boosting_type="dart", n_estimators=50)
gbm.fit(X_train, y_train)
score = gbm.score(X_test, y_test)
assert 0.8 <= score <= 1.0
......@@ -269,22 +277,21 @@ def test_dart():
def test_stacking_classifier():
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
classifiers = [('gbm1', lgb.LGBMClassifier(n_estimators=3)),
('gbm2', lgb.LGBMClassifier(n_estimators=3))]
clf = StackingClassifier(estimators=classifiers,
final_estimator=lgb.LGBMClassifier(n_estimators=3),
passthrough=True)
classifiers = [("gbm1", lgb.LGBMClassifier(n_estimators=3)), ("gbm2", lgb.LGBMClassifier(n_estimators=3))]
clf = StackingClassifier(
estimators=classifiers, final_estimator=lgb.LGBMClassifier(n_estimators=3), passthrough=True
)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
assert score >= 0.8
assert score <= 1.
assert score <= 1.0
assert clf.n_features_in_ == 4 # number of input features
assert len(clf.named_estimators_['gbm1'].feature_importances_) == 4
assert clf.named_estimators_['gbm1'].n_features_in_ == clf.named_estimators_['gbm2'].n_features_in_
assert len(clf.named_estimators_["gbm1"].feature_importances_) == 4
assert clf.named_estimators_["gbm1"].n_features_in_ == clf.named_estimators_["gbm2"].n_features_in_
assert clf.final_estimator_.n_features_in_ == 10 # number of concatenated features
assert len(clf.final_estimator_.feature_importances_) == 10
assert all(clf.named_estimators_['gbm1'].classes_ == clf.named_estimators_['gbm2'].classes_)
assert all(clf.classes_ == clf.named_estimators_['gbm1'].classes_)
assert all(clf.named_estimators_["gbm1"].classes_ == clf.named_estimators_["gbm2"].classes_)
assert all(clf.classes_ == clf.named_estimators_["gbm1"].classes_)
def test_stacking_regressor():
......@@ -292,18 +299,15 @@ def test_stacking_regressor():
n_features = X.shape[1]
n_input_models = 2
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
regressors = [('gbm1', lgb.LGBMRegressor(n_estimators=3)),
('gbm2', lgb.LGBMRegressor(n_estimators=3))]
reg = StackingRegressor(estimators=regressors,
final_estimator=lgb.LGBMRegressor(n_estimators=3),
passthrough=True)
regressors = [("gbm1", lgb.LGBMRegressor(n_estimators=3)), ("gbm2", lgb.LGBMRegressor(n_estimators=3))]
reg = StackingRegressor(estimators=regressors, final_estimator=lgb.LGBMRegressor(n_estimators=3), passthrough=True)
reg.fit(X_train, y_train)
score = reg.score(X_test, y_test)
assert score >= 0.2
assert score <= 1.
assert score <= 1.0
assert reg.n_features_in_ == n_features # number of input features
assert len(reg.named_estimators_['gbm1'].feature_importances_) == n_features
assert reg.named_estimators_['gbm1'].n_features_in_ == reg.named_estimators_['gbm2'].n_features_in_
assert len(reg.named_estimators_["gbm1"].feature_importances_) == n_features
assert reg.named_estimators_["gbm1"].n_features_in_ == reg.named_estimators_["gbm2"].n_features_in_
assert reg.final_estimator_.n_features_in_ == n_features + n_input_models # number of concatenated features
assert len(reg.final_estimator_.feature_importances_) == n_features + n_input_models
......@@ -313,91 +317,69 @@ def test_grid_search():
y = y.astype(str) # utilize label encoder at its max power
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
params = {
"subsample": 0.8,
"subsample_freq": 1
}
grid_params = {
"boosting_type": ['rf', 'gbdt'],
"n_estimators": [4, 6],
"reg_alpha": [0.01, 0.005]
}
params = {"subsample": 0.8, "subsample_freq": 1}
grid_params = {"boosting_type": ["rf", "gbdt"], "n_estimators": [4, 6], "reg_alpha": [0.01, 0.005]}
evals_result = {}
fit_params = {
"eval_set": [(X_val, y_val)],
"eval_metric": constant_metric,
"callbacks": [
lgb.early_stopping(2),
lgb.record_evaluation(evals_result)
]
"callbacks": [lgb.early_stopping(2), lgb.record_evaluation(evals_result)],
}
grid = GridSearchCV(estimator=lgb.LGBMClassifier(**params), param_grid=grid_params, cv=2)
grid.fit(X_train, y_train, **fit_params)
score = grid.score(X_test, y_test) # utilizes GridSearchCV default refit=True
assert grid.best_params_['boosting_type'] in ['rf', 'gbdt']
assert grid.best_params_['n_estimators'] in [4, 6]
assert grid.best_params_['reg_alpha'] in [0.01, 0.005]
assert grid.best_score_ <= 1.
assert grid.best_params_["boosting_type"] in ["rf", "gbdt"]
assert grid.best_params_["n_estimators"] in [4, 6]
assert grid.best_params_["reg_alpha"] in [0.01, 0.005]
assert grid.best_score_ <= 1.0
assert grid.best_estimator_.best_iteration_ == 1
assert grid.best_estimator_.best_score_['valid_0']['multi_logloss'] < 0.25
assert grid.best_estimator_.best_score_['valid_0']['error'] == 0
assert grid.best_estimator_.best_score_["valid_0"]["multi_logloss"] < 0.25
assert grid.best_estimator_.best_score_["valid_0"]["error"] == 0
assert score >= 0.2
assert score <= 1.
assert score <= 1.0
assert evals_result == grid.best_estimator_.evals_result_
def test_random_search():
X, y = load_iris(return_X_y=True)
y = y.astype(str) # utilize label encoder at its max power
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1,
random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
n_iter = 3 # Number of samples
params = {
"subsample": 0.8,
"subsample_freq": 1
}
params = {"subsample": 0.8, "subsample_freq": 1}
param_dist = {
"boosting_type": ['rf', 'gbdt'],
"boosting_type": ["rf", "gbdt"],
"n_estimators": [np.random.randint(low=3, high=10) for i in range(n_iter)],
"reg_alpha": [np.random.uniform(low=0.01, high=0.06) for i in range(n_iter)]
}
fit_params = {
"eval_set": [(X_val, y_val)],
"eval_metric": constant_metric,
"callbacks": [lgb.early_stopping(2)]
"reg_alpha": [np.random.uniform(low=0.01, high=0.06) for i in range(n_iter)],
}
rand = RandomizedSearchCV(estimator=lgb.LGBMClassifier(**params),
param_distributions=param_dist, cv=2,
n_iter=n_iter, random_state=42)
fit_params = {"eval_set": [(X_val, y_val)], "eval_metric": constant_metric, "callbacks": [lgb.early_stopping(2)]}
rand = RandomizedSearchCV(
estimator=lgb.LGBMClassifier(**params), param_distributions=param_dist, cv=2, n_iter=n_iter, random_state=42
)
rand.fit(X_train, y_train, **fit_params)
score = rand.score(X_test, y_test) # utilizes RandomizedSearchCV default refit=True
assert rand.best_params_['boosting_type'] in ['rf', 'gbdt']
assert rand.best_params_['n_estimators'] in list(range(3, 10))
assert rand.best_params_['reg_alpha'] >= 0.01 # Left-closed boundary point
assert rand.best_params_['reg_alpha'] <= 0.06 # Right-closed boundary point
assert rand.best_score_ <= 1.
assert rand.best_estimator_.best_score_['valid_0']['multi_logloss'] < 0.25
assert rand.best_estimator_.best_score_['valid_0']['error'] == 0
assert rand.best_params_["boosting_type"] in ["rf", "gbdt"]
assert rand.best_params_["n_estimators"] in list(range(3, 10))
assert rand.best_params_["reg_alpha"] >= 0.01 # Left-closed boundary point
assert rand.best_params_["reg_alpha"] <= 0.06 # Right-closed boundary point
assert rand.best_score_ <= 1.0
assert rand.best_estimator_.best_score_["valid_0"]["multi_logloss"] < 0.25
assert rand.best_estimator_.best_score_["valid_0"]["error"] == 0
assert score >= 0.2
assert score <= 1.
assert score <= 1.0
def test_multioutput_classifier():
n_outputs = 3
X, y = make_multilabel_classification(n_samples=100, n_features=20,
n_classes=n_outputs, random_state=0)
X, y = make_multilabel_classification(n_samples=100, n_features=20, n_classes=n_outputs, random_state=0)
y = y.astype(str) # utilize label encoder at its max power
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
clf = MultiOutputClassifier(estimator=lgb.LGBMClassifier(n_estimators=10))
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
assert score >= 0.2
assert score <= 1.
np.testing.assert_array_equal(np.tile(np.unique(y_train), n_outputs),
np.concatenate(clf.classes_))
assert score <= 1.0
np.testing.assert_array_equal(np.tile(np.unique(y_train), n_outputs), np.concatenate(clf.classes_))
for classifier in clf.estimators_:
assert isinstance(classifier, lgb.LGBMClassifier)
assert isinstance(classifier.booster_, lgb.Booster)
......@@ -405,15 +387,14 @@ def test_multioutput_classifier():
def test_multioutput_regressor():
bunch = load_linnerud(as_frame=True) # returns a Bunch instance
X, y = bunch['data'], bunch['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
random_state=42)
X, y = bunch["data"], bunch["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
reg = MultiOutputRegressor(estimator=lgb.LGBMRegressor(n_estimators=10))
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
_, score, _ = mse(y_test, y_pred)
assert score >= 0.2
assert score <= 120.
assert score <= 120.0
for regressor in reg.estimators_:
assert isinstance(regressor, lgb.LGBMRegressor)
assert isinstance(regressor.booster_, lgb.Booster)
......@@ -421,19 +402,15 @@ def test_multioutput_regressor():
def test_classifier_chain():
n_outputs = 3
X, y = make_multilabel_classification(n_samples=100, n_features=20,
n_classes=n_outputs, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
random_state=42)
X, y = make_multilabel_classification(n_samples=100, n_features=20, n_classes=n_outputs, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
order = [2, 0, 1]
clf = ClassifierChain(base_estimator=lgb.LGBMClassifier(n_estimators=10),
order=order, random_state=42)
clf = ClassifierChain(base_estimator=lgb.LGBMClassifier(n_estimators=10), order=order, random_state=42)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
assert score >= 0.2
assert score <= 1.
np.testing.assert_array_equal(np.tile(np.unique(y_train), n_outputs),
np.concatenate(clf.classes_))
assert score <= 1.0
np.testing.assert_array_equal(np.tile(np.unique(y_train), n_outputs), np.concatenate(clf.classes_))
assert order == clf.order_
for classifier in clf.estimators_:
assert isinstance(classifier, lgb.LGBMClassifier)
......@@ -442,16 +419,15 @@ def test_classifier_chain():
def test_regressor_chain():
bunch = load_linnerud(as_frame=True) # returns a Bunch instance
X, y = bunch['data'], bunch['target']
X, y = bunch["data"], bunch["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
order = [2, 0, 1]
reg = RegressorChain(base_estimator=lgb.LGBMRegressor(n_estimators=10), order=order,
random_state=42)
reg = RegressorChain(base_estimator=lgb.LGBMRegressor(n_estimators=10), order=order, random_state=42)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
_, score, _ = mse(y_test, y_pred)
assert score >= 0.2
assert score <= 120.
assert score <= 120.0
assert order == reg.order_
for regressor in reg.estimators_:
assert isinstance(regressor, lgb.LGBMRegressor)
......@@ -489,24 +465,17 @@ def test_clone_and_property():
def test_joblib():
X, y = make_synthetic_regression()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
gbm = lgb.LGBMRegressor(n_estimators=10, objective=custom_asymmetric_obj,
verbose=-1, importance_type='split')
gbm = lgb.LGBMRegressor(n_estimators=10, objective=custom_asymmetric_obj, verbose=-1, importance_type="split")
gbm.fit(
X_train,
y_train,
eval_set=[
(X_train, y_train),
(X_test, y_test)
],
eval_set=[(X_train, y_train), (X_test, y_test)],
eval_metric=mse,
callbacks=[
lgb.early_stopping(5),
lgb.reset_parameter(learning_rate=list(np.arange(1, 0, -0.1)))
]
callbacks=[lgb.early_stopping(5), lgb.reset_parameter(learning_rate=list(np.arange(1, 0, -0.1)))],
)
joblib.dump(gbm, 'lgb.pkl') # test model with custom functions
gbm_pickle = joblib.load('lgb.pkl')
joblib.dump(gbm, "lgb.pkl") # test model with custom functions
gbm_pickle = joblib.load("lgb.pkl")
assert isinstance(gbm_pickle.booster_, lgb.Booster)
assert gbm.get_params() == gbm_pickle.get_params()
np.testing.assert_array_equal(gbm.feature_importances_, gbm_pickle.feature_importances_)
......@@ -515,8 +484,7 @@ def test_joblib():
for eval_set in gbm.evals_result_:
for metric in gbm.evals_result_[eval_set]:
np.testing.assert_allclose(gbm.evals_result_[eval_set][metric],
gbm_pickle.evals_result_[eval_set][metric])
np.testing.assert_allclose(gbm.evals_result_[eval_set][metric], gbm_pickle.evals_result_[eval_set][metric])
pred_origin = gbm.predict(X_test)
pred_pickle = gbm_pickle.predict(X_test)
np.testing.assert_allclose(pred_origin, pred_pickle)
......@@ -526,7 +494,7 @@ def test_non_serializable_objects_in_callbacks(tmp_path):
unpicklable_callback = UnpicklableCallback()
with pytest.raises(Exception, match="This class in not picklable"):
joblib.dump(unpicklable_callback, tmp_path / 'tmp.joblib')
joblib.dump(unpicklable_callback, tmp_path / "tmp.joblib")
X, y = make_synthetic_regression()
gbm = lgb.LGBMRegressor(n_estimators=5)
......@@ -578,9 +546,9 @@ def test_feature_importances_type():
data = load_iris(return_X_y=False)
clf = lgb.LGBMClassifier(n_estimators=10)
clf.fit(data.data, data.target)
clf.set_params(importance_type='split')
clf.set_params(importance_type="split")
importances_split = clf.feature_importances_
clf.set_params(importance_type='gain')
clf.set_params(importance_type="gain")
importances_gain = clf.feature_importances_
# Test that the largest element is NOT the same; the smallest can be the same, i.e. zero
importance_split_top1 = sorted(importances_split, reverse=True)[0]
......@@ -591,38 +559,44 @@ def test_feature_importances_type():
def test_pandas_categorical():
pd = pytest.importorskip("pandas")
np.random.seed(42) # sometimes there is no difference how cols are treated (cat or not cat)
X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75), # str
"B": np.random.permutation([1, 2, 3] * 100), # int
"C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float
"D": np.random.permutation([True, False] * 150), # bool
"E": pd.Categorical(np.random.permutation(['z', 'y', 'x', 'w', 'v'] * 60),
ordered=True)}) # str and ordered categorical
X = pd.DataFrame(
{
"A": np.random.permutation(["a", "b", "c", "d"] * 75), # str
"B": np.random.permutation([1, 2, 3] * 100), # int
"C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float
"D": np.random.permutation([True, False] * 150), # bool
"E": pd.Categorical(np.random.permutation(["z", "y", "x", "w", "v"] * 60), ordered=True),
}
) # str and ordered categorical
y = np.random.permutation([0, 1] * 150)
X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20), # unseen category
"B": np.random.permutation([1, 3] * 30),
"C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
"D": np.random.permutation([True, False] * 30),
"E": pd.Categorical(np.random.permutation(['z', 'y'] * 30),
ordered=True)})
X_test = pd.DataFrame(
{
"A": np.random.permutation(["a", "b", "e"] * 20), # unseen category
"B": np.random.permutation([1, 3] * 30),
"C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
"D": np.random.permutation([True, False] * 30),
"E": pd.Categorical(np.random.permutation(["z", "y"] * 30), ordered=True),
}
)
np.random.seed() # reset seed
cat_cols_actual = ["A", "B", "C", "D"]
cat_cols_to_store = cat_cols_actual + ["E"]
X[cat_cols_actual] = X[cat_cols_actual].astype('category')
X_test[cat_cols_actual] = X_test[cat_cols_actual].astype('category')
X[cat_cols_actual] = X[cat_cols_actual].astype("category")
X_test[cat_cols_actual] = X_test[cat_cols_actual].astype("category")
cat_values = [X[col].cat.categories.tolist() for col in cat_cols_to_store]
gbm0 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y)
pred0 = gbm0.predict(X_test, raw_score=True)
pred_prob = gbm0.predict_proba(X_test)[:, 1]
gbm1 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, pd.Series(y), categorical_feature=[0])
pred1 = gbm1.predict(X_test, raw_score=True)
gbm2 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=['A'])
gbm2 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=["A"])
pred2 = gbm2.predict(X_test, raw_score=True)
gbm3 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=['A', 'B', 'C', 'D'])
gbm3 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=["A", "B", "C", "D"])
pred3 = gbm3.predict(X_test, raw_score=True)
gbm3.booster_.save_model('categorical.model')
gbm4 = lgb.Booster(model_file='categorical.model')
gbm3.booster_.save_model("categorical.model")
gbm4 = lgb.Booster(model_file="categorical.model")
pred4 = gbm4.predict(X_test)
gbm5 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=['A', 'B', 'C', 'D', 'E'])
gbm5 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=["A", "B", "C", "D", "E"])
pred5 = gbm5.predict(X_test, raw_score=True)
gbm6 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=[])
pred6 = gbm6.predict(X_test, raw_score=True)
......@@ -648,18 +622,26 @@ def test_pandas_categorical():
def test_pandas_sparse():
pd = pytest.importorskip("pandas")
X = pd.DataFrame({"A": pd.arrays.SparseArray(np.random.permutation([0, 1, 2] * 100)),
"B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
"C": pd.arrays.SparseArray(np.random.permutation([True, False] * 150))})
X = pd.DataFrame(
{
"A": pd.arrays.SparseArray(np.random.permutation([0, 1, 2] * 100)),
"B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
"C": pd.arrays.SparseArray(np.random.permutation([True, False] * 150)),
}
)
y = pd.Series(pd.arrays.SparseArray(np.random.permutation([0, 1] * 150)))
X_test = pd.DataFrame({"A": pd.arrays.SparseArray(np.random.permutation([0, 2] * 30)),
"B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
"C": pd.arrays.SparseArray(np.random.permutation([True, False] * 30))})
X_test = pd.DataFrame(
{
"A": pd.arrays.SparseArray(np.random.permutation([0, 2] * 30)),
"B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
"C": pd.arrays.SparseArray(np.random.permutation([True, False] * 30)),
}
)
for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
assert pd.api.types.is_sparse(dtype)
gbm = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y)
pred_sparse = gbm.predict(X_test, raw_score=True)
if hasattr(X_test, 'sparse'):
if hasattr(X_test, "sparse"):
pred_dense = gbm.predict(X_test.sparse.to_dense(), raw_score=True)
else:
pred_dense = gbm.predict(X_test.to_dense(), raw_score=True)
......@@ -669,13 +651,9 @@ def test_pandas_sparse():
def test_predict():
# With default params
iris = load_iris(return_X_y=False)
X_train, X_test, y_train, _ = train_test_split(iris.data, iris.target,
test_size=0.2, random_state=42)
X_train, X_test, y_train, _ = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)
gbm = lgb.train({'objective': 'multiclass',
'num_class': 3,
'verbose': -1},
lgb.Dataset(X_train, y_train))
gbm = lgb.train({"objective": "multiclass", "num_class": 3, "verbose": -1}, lgb.Dataset(X_train, y_train))
clf = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train)
# Tests same probabilities
......@@ -705,9 +683,7 @@ def test_predict():
# Tests other parameters for the prediction works
res_engine = gbm.predict(X_test)
res_sklearn_params = clf.predict_proba(X_test,
pred_early_stop=True,
pred_early_stop_margin=1.0)
res_sklearn_params = clf.predict_proba(X_test, pred_early_stop=True, pred_early_stop_margin=1.0)
with pytest.raises(AssertionError):
np.testing.assert_allclose(res_engine, res_sklearn_params)
......@@ -739,9 +715,7 @@ def test_predict():
# Tests other parameters for the prediction works, starting from iteration 10
res_engine = gbm.predict(X_test, start_iteration=10)
res_sklearn_params = clf.predict_proba(X_test,
pred_early_stop=True,
pred_early_stop_margin=1.0, start_iteration=10)
res_sklearn_params = clf.predict_proba(X_test, pred_early_stop=True, pred_early_stop_margin=1.0, start_iteration=10)
with pytest.raises(AssertionError):
np.testing.assert_allclose(res_engine, res_sklearn_params)
......@@ -750,34 +724,43 @@ def test_predict_with_params_from_init():
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)
predict_params = {
'pred_early_stop': True,
'pred_early_stop_margin': 1.0
}
predict_params = {"pred_early_stop": True, "pred_early_stop_margin": 1.0}
y_preds_no_params = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train).predict(
X_test, raw_score=True)
y_preds_no_params = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train).predict(X_test, raw_score=True)
y_preds_params_in_predict = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train).predict(
X_test, raw_score=True, **predict_params)
y_preds_params_in_predict = (
lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train).predict(X_test, raw_score=True, **predict_params)
)
with pytest.raises(AssertionError):
np.testing.assert_allclose(y_preds_no_params, y_preds_params_in_predict)
y_preds_params_in_set_params_before_fit = lgb.LGBMClassifier(verbose=-1).set_params(
**predict_params).fit(X_train, y_train).predict(X_test, raw_score=True)
y_preds_params_in_set_params_before_fit = (
lgb.LGBMClassifier(verbose=-1)
.set_params(**predict_params)
.fit(X_train, y_train)
.predict(X_test, raw_score=True)
)
np.testing.assert_allclose(y_preds_params_in_predict, y_preds_params_in_set_params_before_fit)
y_preds_params_in_set_params_after_fit = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train).set_params(
**predict_params).predict(X_test, raw_score=True)
y_preds_params_in_set_params_after_fit = (
lgb.LGBMClassifier(verbose=-1)
.fit(X_train, y_train)
.set_params(**predict_params)
.predict(X_test, raw_score=True)
)
np.testing.assert_allclose(y_preds_params_in_predict, y_preds_params_in_set_params_after_fit)
y_preds_params_in_init = lgb.LGBMClassifier(verbose=-1, **predict_params).fit(X_train, y_train).predict(
X_test, raw_score=True)
y_preds_params_in_init = (
lgb.LGBMClassifier(verbose=-1, **predict_params).fit(X_train, y_train).predict(X_test, raw_score=True)
)
np.testing.assert_allclose(y_preds_params_in_predict, y_preds_params_in_init)
# test that params passed in predict have higher priority
y_preds_params_overwritten = lgb.LGBMClassifier(verbose=-1, **predict_params).fit(X_train, y_train).predict(
X_test, raw_score=True, pred_early_stop=False)
y_preds_params_overwritten = (
lgb.LGBMClassifier(verbose=-1, **predict_params)
.fit(X_train, y_train)
.predict(X_test, raw_score=True, pred_early_stop=False)
)
np.testing.assert_allclose(y_preds_no_params, y_preds_params_overwritten)
......@@ -787,315 +770,307 @@ def test_evaluate_train_set():
gbm = lgb.LGBMRegressor(n_estimators=10, verbose=-1)
gbm.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])
assert len(gbm.evals_result_) == 2
assert 'training' in gbm.evals_result_
assert len(gbm.evals_result_['training']) == 1
assert 'l2' in gbm.evals_result_['training']
assert 'valid_1' in gbm.evals_result_
assert len(gbm.evals_result_['valid_1']) == 1
assert 'l2' in gbm.evals_result_['valid_1']
assert "training" in gbm.evals_result_
assert len(gbm.evals_result_["training"]) == 1
assert "l2" in gbm.evals_result_["training"]
assert "valid_1" in gbm.evals_result_
assert len(gbm.evals_result_["valid_1"]) == 1
assert "l2" in gbm.evals_result_["valid_1"]
def test_metrics():
X, y = make_synthetic_regression()
y = abs(y)
params = {'n_estimators': 2, 'verbose': -1}
params_fit = {'X': X, 'y': y, 'eval_set': (X, y)}
params = {"n_estimators": 2, "verbose": -1}
params_fit = {"X": X, "y": y, "eval_set": (X, y)}
# no custom objective, no custom metric
# default metric
gbm = lgb.LGBMRegressor(**params).fit(**params_fit)
assert len(gbm.evals_result_['training']) == 1
assert 'l2' in gbm.evals_result_['training']
assert len(gbm.evals_result_["training"]) == 1
assert "l2" in gbm.evals_result_["training"]
# non-default metric
gbm = lgb.LGBMRegressor(metric='mape', **params).fit(**params_fit)
assert len(gbm.evals_result_['training']) == 1
assert 'mape' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(metric="mape", **params).fit(**params_fit)
assert len(gbm.evals_result_["training"]) == 1
assert "mape" in gbm.evals_result_["training"]
# no metric
gbm = lgb.LGBMRegressor(metric='None', **params).fit(**params_fit)
gbm = lgb.LGBMRegressor(metric="None", **params).fit(**params_fit)
assert gbm.evals_result_ == {}
# non-default metric in eval_metric
gbm = lgb.LGBMRegressor(**params).fit(eval_metric='mape', **params_fit)
assert len(gbm.evals_result_['training']) == 2
assert 'l2' in gbm.evals_result_['training']
assert 'mape' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(**params).fit(eval_metric="mape", **params_fit)
assert len(gbm.evals_result_["training"]) == 2
assert "l2" in gbm.evals_result_["training"]
assert "mape" in gbm.evals_result_["training"]
# non-default metric with non-default metric in eval_metric
gbm = lgb.LGBMRegressor(metric='gamma', **params).fit(eval_metric='mape', **params_fit)
assert len(gbm.evals_result_['training']) == 2
assert 'gamma' in gbm.evals_result_['training']
assert 'mape' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(metric="gamma", **params).fit(eval_metric="mape", **params_fit)
assert len(gbm.evals_result_["training"]) == 2
assert "gamma" in gbm.evals_result_["training"]
assert "mape" in gbm.evals_result_["training"]
# non-default metric with multiple metrics in eval_metric
gbm = lgb.LGBMRegressor(metric='gamma',
**params).fit(eval_metric=['l2', 'mape'], **params_fit)
assert len(gbm.evals_result_['training']) == 3
assert 'gamma' in gbm.evals_result_['training']
assert 'l2' in gbm.evals_result_['training']
assert 'mape' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(metric="gamma", **params).fit(eval_metric=["l2", "mape"], **params_fit)
assert len(gbm.evals_result_["training"]) == 3
assert "gamma" in gbm.evals_result_["training"]
assert "l2" in gbm.evals_result_["training"]
assert "mape" in gbm.evals_result_["training"]
# non-default metric with multiple metrics in eval_metric for LGBMClassifier
X_classification, y_classification = load_breast_cancer(return_X_y=True)
params_classification = {'n_estimators': 2, 'verbose': -1,
'objective': 'binary', 'metric': 'binary_logloss'}
params_fit_classification = {'X': X_classification, 'y': y_classification,
'eval_set': (X_classification, y_classification)}
gbm = lgb.LGBMClassifier(**params_classification).fit(eval_metric=['fair', 'error'],
**params_fit_classification)
assert len(gbm.evals_result_['training']) == 3
assert 'fair' in gbm.evals_result_['training']
assert 'binary_error' in gbm.evals_result_['training']
assert 'binary_logloss' in gbm.evals_result_['training']
params_classification = {"n_estimators": 2, "verbose": -1, "objective": "binary", "metric": "binary_logloss"}
params_fit_classification = {
"X": X_classification,
"y": y_classification,
"eval_set": (X_classification, y_classification),
}
gbm = lgb.LGBMClassifier(**params_classification).fit(eval_metric=["fair", "error"], **params_fit_classification)
assert len(gbm.evals_result_["training"]) == 3
assert "fair" in gbm.evals_result_["training"]
assert "binary_error" in gbm.evals_result_["training"]
assert "binary_logloss" in gbm.evals_result_["training"]
# default metric for non-default objective
gbm = lgb.LGBMRegressor(objective='regression_l1', **params).fit(**params_fit)
assert len(gbm.evals_result_['training']) == 1
assert 'l1' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(objective="regression_l1", **params).fit(**params_fit)
assert len(gbm.evals_result_["training"]) == 1
assert "l1" in gbm.evals_result_["training"]
# non-default metric for non-default objective
gbm = lgb.LGBMRegressor(objective='regression_l1', metric='mape',
**params).fit(**params_fit)
assert len(gbm.evals_result_['training']) == 1
assert 'mape' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(objective="regression_l1", metric="mape", **params).fit(**params_fit)
assert len(gbm.evals_result_["training"]) == 1
assert "mape" in gbm.evals_result_["training"]
# no metric
gbm = lgb.LGBMRegressor(objective='regression_l1', metric='None',
**params).fit(**params_fit)
gbm = lgb.LGBMRegressor(objective="regression_l1", metric="None", **params).fit(**params_fit)
assert gbm.evals_result_ == {}
# non-default metric in eval_metric for non-default objective
gbm = lgb.LGBMRegressor(objective='regression_l1',
**params).fit(eval_metric='mape', **params_fit)
assert len(gbm.evals_result_['training']) == 2
assert 'l1' in gbm.evals_result_['training']
assert 'mape' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(objective="regression_l1", **params).fit(eval_metric="mape", **params_fit)
assert len(gbm.evals_result_["training"]) == 2
assert "l1" in gbm.evals_result_["training"]
assert "mape" in gbm.evals_result_["training"]
# non-default metric with non-default metric in eval_metric for non-default objective
gbm = lgb.LGBMRegressor(objective='regression_l1', metric='gamma',
**params).fit(eval_metric='mape', **params_fit)
assert len(gbm.evals_result_['training']) == 2
assert 'gamma' in gbm.evals_result_['training']
assert 'mape' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(objective="regression_l1", metric="gamma", **params).fit(eval_metric="mape", **params_fit)
assert len(gbm.evals_result_["training"]) == 2
assert "gamma" in gbm.evals_result_["training"]
assert "mape" in gbm.evals_result_["training"]
# non-default metric with multiple metrics in eval_metric for non-default objective
gbm = lgb.LGBMRegressor(objective='regression_l1', metric='gamma',
**params).fit(eval_metric=['l2', 'mape'], **params_fit)
assert len(gbm.evals_result_['training']) == 3
assert 'gamma' in gbm.evals_result_['training']
assert 'l2' in gbm.evals_result_['training']
assert 'mape' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(objective="regression_l1", metric="gamma", **params).fit(
eval_metric=["l2", "mape"], **params_fit
)
assert len(gbm.evals_result_["training"]) == 3
assert "gamma" in gbm.evals_result_["training"]
assert "l2" in gbm.evals_result_["training"]
assert "mape" in gbm.evals_result_["training"]
# custom objective, no custom metric
# default regression metric for custom objective
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, **params).fit(**params_fit)
assert len(gbm.evals_result_['training']) == 1
assert 'l2' in gbm.evals_result_['training']
assert len(gbm.evals_result_["training"]) == 1
assert "l2" in gbm.evals_result_["training"]
# non-default regression metric for custom objective
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric='mape', **params).fit(**params_fit)
assert len(gbm.evals_result_['training']) == 1
assert 'mape' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric="mape", **params).fit(**params_fit)
assert len(gbm.evals_result_["training"]) == 1
assert "mape" in gbm.evals_result_["training"]
# multiple regression metrics for custom objective
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=['l1', 'gamma'],
**params).fit(**params_fit)
assert len(gbm.evals_result_['training']) == 2
assert 'l1' in gbm.evals_result_['training']
assert 'gamma' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=["l1", "gamma"], **params).fit(**params_fit)
assert len(gbm.evals_result_["training"]) == 2
assert "l1" in gbm.evals_result_["training"]
assert "gamma" in gbm.evals_result_["training"]
# no metric
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric='None',
**params).fit(**params_fit)
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric="None", **params).fit(**params_fit)
assert gbm.evals_result_ == {}
# default regression metric with non-default metric in eval_metric for custom objective
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj,
**params).fit(eval_metric='mape', **params_fit)
assert len(gbm.evals_result_['training']) == 2
assert 'l2' in gbm.evals_result_['training']
assert 'mape' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, **params).fit(eval_metric="mape", **params_fit)
assert len(gbm.evals_result_["training"]) == 2
assert "l2" in gbm.evals_result_["training"]
assert "mape" in gbm.evals_result_["training"]
# non-default regression metric with metric in eval_metric for custom objective
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric='mape',
**params).fit(eval_metric='gamma', **params_fit)
assert len(gbm.evals_result_['training']) == 2
assert 'mape' in gbm.evals_result_['training']
assert 'gamma' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric="mape", **params).fit(eval_metric="gamma", **params_fit)
assert len(gbm.evals_result_["training"]) == 2
assert "mape" in gbm.evals_result_["training"]
assert "gamma" in gbm.evals_result_["training"]
# multiple regression metrics with metric in eval_metric for custom objective
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=['l1', 'gamma'],
**params).fit(eval_metric='l2', **params_fit)
assert len(gbm.evals_result_['training']) == 3
assert 'l1' in gbm.evals_result_['training']
assert 'gamma' in gbm.evals_result_['training']
assert 'l2' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=["l1", "gamma"], **params).fit(
eval_metric="l2", **params_fit
)
assert len(gbm.evals_result_["training"]) == 3
assert "l1" in gbm.evals_result_["training"]
assert "gamma" in gbm.evals_result_["training"]
assert "l2" in gbm.evals_result_["training"]
# multiple regression metrics with multiple metrics in eval_metric for custom objective
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=['l1', 'gamma'],
**params).fit(eval_metric=['l2', 'mape'], **params_fit)
assert len(gbm.evals_result_['training']) == 4
assert 'l1' in gbm.evals_result_['training']
assert 'gamma' in gbm.evals_result_['training']
assert 'l2' in gbm.evals_result_['training']
assert 'mape' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=["l1", "gamma"], **params).fit(
eval_metric=["l2", "mape"], **params_fit
)
assert len(gbm.evals_result_["training"]) == 4
assert "l1" in gbm.evals_result_["training"]
assert "gamma" in gbm.evals_result_["training"]
assert "l2" in gbm.evals_result_["training"]
assert "mape" in gbm.evals_result_["training"]
# no custom objective, custom metric
# default metric with custom metric
gbm = lgb.LGBMRegressor(**params).fit(eval_metric=constant_metric, **params_fit)
assert len(gbm.evals_result_['training']) == 2
assert 'l2' in gbm.evals_result_['training']
assert 'error' in gbm.evals_result_['training']
assert len(gbm.evals_result_["training"]) == 2
assert "l2" in gbm.evals_result_["training"]
assert "error" in gbm.evals_result_["training"]
# non-default metric with custom metric
gbm = lgb.LGBMRegressor(metric='mape',
**params).fit(eval_metric=constant_metric, **params_fit)
assert len(gbm.evals_result_['training']) == 2
assert 'mape' in gbm.evals_result_['training']
assert 'error' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(metric="mape", **params).fit(eval_metric=constant_metric, **params_fit)
assert len(gbm.evals_result_["training"]) == 2
assert "mape" in gbm.evals_result_["training"]
assert "error" in gbm.evals_result_["training"]
# multiple metrics with custom metric
gbm = lgb.LGBMRegressor(metric=['l1', 'gamma'],
**params).fit(eval_metric=constant_metric, **params_fit)
assert len(gbm.evals_result_['training']) == 3
assert 'l1' in gbm.evals_result_['training']
assert 'gamma' in gbm.evals_result_['training']
assert 'error' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(metric=["l1", "gamma"], **params).fit(eval_metric=constant_metric, **params_fit)
assert len(gbm.evals_result_["training"]) == 3
assert "l1" in gbm.evals_result_["training"]
assert "gamma" in gbm.evals_result_["training"]
assert "error" in gbm.evals_result_["training"]
# custom metric (disable default metric)
gbm = lgb.LGBMRegressor(metric='None',
**params).fit(eval_metric=constant_metric, **params_fit)
assert len(gbm.evals_result_['training']) == 1
assert 'error' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(metric="None", **params).fit(eval_metric=constant_metric, **params_fit)
assert len(gbm.evals_result_["training"]) == 1
assert "error" in gbm.evals_result_["training"]
# default metric for non-default objective with custom metric
gbm = lgb.LGBMRegressor(objective='regression_l1',
**params).fit(eval_metric=constant_metric, **params_fit)
assert len(gbm.evals_result_['training']) == 2
assert 'l1' in gbm.evals_result_['training']
assert 'error' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(objective="regression_l1", **params).fit(eval_metric=constant_metric, **params_fit)
assert len(gbm.evals_result_["training"]) == 2
assert "l1" in gbm.evals_result_["training"]
assert "error" in gbm.evals_result_["training"]
# non-default metric for non-default objective with custom metric
gbm = lgb.LGBMRegressor(objective='regression_l1', metric='mape',
**params).fit(eval_metric=constant_metric, **params_fit)
assert len(gbm.evals_result_['training']) == 2
assert 'mape' in gbm.evals_result_['training']
assert 'error' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(objective="regression_l1", metric="mape", **params).fit(
eval_metric=constant_metric, **params_fit
)
assert len(gbm.evals_result_["training"]) == 2
assert "mape" in gbm.evals_result_["training"]
assert "error" in gbm.evals_result_["training"]
# multiple metrics for non-default objective with custom metric
gbm = lgb.LGBMRegressor(objective='regression_l1', metric=['l1', 'gamma'],
**params).fit(eval_metric=constant_metric, **params_fit)
assert len(gbm.evals_result_['training']) == 3
assert 'l1' in gbm.evals_result_['training']
assert 'gamma' in gbm.evals_result_['training']
assert 'error' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(objective="regression_l1", metric=["l1", "gamma"], **params).fit(
eval_metric=constant_metric, **params_fit
)
assert len(gbm.evals_result_["training"]) == 3
assert "l1" in gbm.evals_result_["training"]
assert "gamma" in gbm.evals_result_["training"]
assert "error" in gbm.evals_result_["training"]
# custom metric (disable default metric for non-default objective)
gbm = lgb.LGBMRegressor(objective='regression_l1', metric='None',
**params).fit(eval_metric=constant_metric, **params_fit)
assert len(gbm.evals_result_['training']) == 1
assert 'error' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(objective="regression_l1", metric="None", **params).fit(
eval_metric=constant_metric, **params_fit
)
assert len(gbm.evals_result_["training"]) == 1
assert "error" in gbm.evals_result_["training"]
# custom objective, custom metric
# custom metric for custom objective
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj,
**params).fit(eval_metric=constant_metric, **params_fit)
assert len(gbm.evals_result_['training']) == 2
assert 'error' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, **params).fit(eval_metric=constant_metric, **params_fit)
assert len(gbm.evals_result_["training"]) == 2
assert "error" in gbm.evals_result_["training"]
# non-default regression metric with custom metric for custom objective
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric='mape',
**params).fit(eval_metric=constant_metric, **params_fit)
assert len(gbm.evals_result_['training']) == 2
assert 'mape' in gbm.evals_result_['training']
assert 'error' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric="mape", **params).fit(
eval_metric=constant_metric, **params_fit
)
assert len(gbm.evals_result_["training"]) == 2
assert "mape" in gbm.evals_result_["training"]
assert "error" in gbm.evals_result_["training"]
# multiple regression metrics with custom metric for custom objective
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=['l2', 'mape'],
**params).fit(eval_metric=constant_metric, **params_fit)
assert len(gbm.evals_result_['training']) == 3
assert 'l2' in gbm.evals_result_['training']
assert 'mape' in gbm.evals_result_['training']
assert 'error' in gbm.evals_result_['training']
gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=["l2", "mape"], **params).fit(
eval_metric=constant_metric, **params_fit
)
assert len(gbm.evals_result_["training"]) == 3
assert "l2" in gbm.evals_result_["training"]
assert "mape" in gbm.evals_result_["training"]
assert "error" in gbm.evals_result_["training"]
X, y = load_digits(n_class=3, return_X_y=True)
params_fit = {'X': X, 'y': y, 'eval_set': (X, y)}
params_fit = {"X": X, "y": y, "eval_set": (X, y)}
# default metric and invalid binary metric is replaced with multiclass alternative
gbm = lgb.LGBMClassifier(**params).fit(eval_metric='binary_error', **params_fit)
assert len(gbm.evals_result_['training']) == 2
assert 'multi_logloss' in gbm.evals_result_['training']
assert 'multi_error' in gbm.evals_result_['training']
gbm = lgb.LGBMClassifier(**params).fit(eval_metric="binary_error", **params_fit)
assert len(gbm.evals_result_["training"]) == 2
assert "multi_logloss" in gbm.evals_result_["training"]
assert "multi_error" in gbm.evals_result_["training"]
# invalid binary metric is replaced with multiclass alternative
gbm = lgb.LGBMClassifier(**params).fit(eval_metric='binary_error', **params_fit)
assert gbm.objective_ == 'multiclass'
assert len(gbm.evals_result_['training']) == 2
assert 'multi_logloss' in gbm.evals_result_['training']
assert 'multi_error' in gbm.evals_result_['training']
gbm = lgb.LGBMClassifier(**params).fit(eval_metric="binary_error", **params_fit)
assert gbm.objective_ == "multiclass"
assert len(gbm.evals_result_["training"]) == 2
assert "multi_logloss" in gbm.evals_result_["training"]
assert "multi_error" in gbm.evals_result_["training"]
# default metric for non-default multiclass objective
# and invalid binary metric is replaced with multiclass alternative
gbm = lgb.LGBMClassifier(objective='ovr',
**params).fit(eval_metric='binary_error', **params_fit)
assert gbm.objective_ == 'ovr'
assert len(gbm.evals_result_['training']) == 2
assert 'multi_logloss' in gbm.evals_result_['training']
assert 'multi_error' in gbm.evals_result_['training']
gbm = lgb.LGBMClassifier(objective="ovr", **params).fit(eval_metric="binary_error", **params_fit)
assert gbm.objective_ == "ovr"
assert len(gbm.evals_result_["training"]) == 2
assert "multi_logloss" in gbm.evals_result_["training"]
assert "multi_error" in gbm.evals_result_["training"]
X, y = load_digits(n_class=2, return_X_y=True)
params_fit = {'X': X, 'y': y, 'eval_set': (X, y)}
params_fit = {"X": X, "y": y, "eval_set": (X, y)}
# default metric and invalid multiclass metric is replaced with binary alternative
gbm = lgb.LGBMClassifier(**params).fit(eval_metric='multi_error', **params_fit)
assert len(gbm.evals_result_['training']) == 2
assert 'binary_logloss' in gbm.evals_result_['training']
assert 'binary_error' in gbm.evals_result_['training']
gbm = lgb.LGBMClassifier(**params).fit(eval_metric="multi_error", **params_fit)
assert len(gbm.evals_result_["training"]) == 2
assert "binary_logloss" in gbm.evals_result_["training"]
assert "binary_error" in gbm.evals_result_["training"]
# invalid multiclass metric is replaced with binary alternative for custom objective
gbm = lgb.LGBMClassifier(objective=custom_dummy_obj,
**params).fit(eval_metric='multi_logloss', **params_fit)
assert len(gbm.evals_result_['training']) == 1
assert 'binary_logloss' in gbm.evals_result_['training']
gbm = lgb.LGBMClassifier(objective=custom_dummy_obj, **params).fit(eval_metric="multi_logloss", **params_fit)
assert len(gbm.evals_result_["training"]) == 1
assert "binary_logloss" in gbm.evals_result_["training"]
def test_multiple_eval_metrics():
X, y = load_breast_cancer(return_X_y=True)
params = {'n_estimators': 2, 'verbose': -1, 'objective': 'binary', 'metric': 'binary_logloss'}
params_fit = {'X': X, 'y': y, 'eval_set': (X, y)}
params = {"n_estimators": 2, "verbose": -1, "objective": "binary", "metric": "binary_logloss"}
params_fit = {"X": X, "y": y, "eval_set": (X, y)}
# Verify that a list containing only callable metrics can be received
gbm = lgb.LGBMClassifier(**params).fit(eval_metric=[constant_metric, decreasing_metric], **params_fit)
assert len(gbm.evals_result_['training']) == 3
assert 'error' in gbm.evals_result_['training']
assert 'decreasing_metric' in gbm.evals_result_['training']
assert 'binary_logloss' in gbm.evals_result_['training']
assert len(gbm.evals_result_["training"]) == 3
assert "error" in gbm.evals_result_["training"]
assert "decreasing_metric" in gbm.evals_result_["training"]
assert "binary_logloss" in gbm.evals_result_["training"]
# Verify that a list mixing custom and built-in metrics can be received
gbm = lgb.LGBMClassifier(**params).fit(eval_metric=[constant_metric, decreasing_metric, 'fair'], **params_fit)
assert len(gbm.evals_result_['training']) == 4
assert 'error' in gbm.evals_result_['training']
assert 'decreasing_metric' in gbm.evals_result_['training']
assert 'binary_logloss' in gbm.evals_result_['training']
assert 'fair' in gbm.evals_result_['training']
gbm = lgb.LGBMClassifier(**params).fit(eval_metric=[constant_metric, decreasing_metric, "fair"], **params_fit)
assert len(gbm.evals_result_["training"]) == 4
assert "error" in gbm.evals_result_["training"]
assert "decreasing_metric" in gbm.evals_result_["training"]
assert "binary_logloss" in gbm.evals_result_["training"]
assert "fair" in gbm.evals_result_["training"]
# Verify that an empty eval_metric list works as expected
gbm = lgb.LGBMClassifier(**params).fit(eval_metric=[], **params_fit)
assert len(gbm.evals_result_['training']) == 1
assert 'binary_logloss' in gbm.evals_result_['training']
assert len(gbm.evals_result_["training"]) == 1
assert "binary_logloss" in gbm.evals_result_["training"]
# Verify that a list containing only built-in metrics can be received
gbm = lgb.LGBMClassifier(**params).fit(eval_metric=['fair', 'error'], **params_fit)
assert len(gbm.evals_result_['training']) == 3
assert 'binary_logloss' in gbm.evals_result_['training']
gbm = lgb.LGBMClassifier(**params).fit(eval_metric=["fair", "error"], **params_fit)
assert len(gbm.evals_result_["training"]) == 3
assert "binary_logloss" in gbm.evals_result_["training"]
# Verify that eval_metric is robust to receiving a list with None
gbm = lgb.LGBMClassifier(**params).fit(eval_metric=['fair', 'error', None], **params_fit)
assert len(gbm.evals_result_['training']) == 3
assert 'binary_logloss' in gbm.evals_result_['training']
gbm = lgb.LGBMClassifier(**params).fit(eval_metric=["fair", "error", None], **params_fit)
assert len(gbm.evals_result_["training"]) == 3
assert "binary_logloss" in gbm.evals_result_["training"]
def test_nan_handle():
......@@ -1104,18 +1079,18 @@ def test_nan_handle():
X = np.random.randn(nrows, ncols)
y = np.random.randn(nrows) + np.full(nrows, 1e30)
weight = np.zeros(nrows)
params = {'n_estimators': 20, 'verbose': -1}
params_fit = {'X': X, 'y': y, 'sample_weight': weight, 'eval_set': (X, y),
'callbacks': [lgb.early_stopping(5)]}
params = {"n_estimators": 20, "verbose": -1}
params_fit = {"X": X, "y": y, "sample_weight": weight, "eval_set": (X, y), "callbacks": [lgb.early_stopping(5)]}
gbm = lgb.LGBMRegressor(**params).fit(**params_fit)
np.testing.assert_allclose(gbm.evals_result_['training']['l2'], np.nan)
np.testing.assert_allclose(gbm.evals_result_["training"]["l2"], np.nan)
@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version')
@pytest.mark.skipif(
getenv("TASK", "") == "cuda", reason="Skip due to differences in implementation details of CUDA version"
)
def test_first_metric_only():
def fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_only):
params['first_metric_only'] = first_metric_only
params["first_metric_only"] = first_metric_only
gbm = lgb.LGBMRegressor(**params).fit(**params_fit)
assert len(gbm.evals_result_) == len(eval_set_names)
for eval_set_name in eval_set_names:
......@@ -1125,11 +1100,13 @@ def test_first_metric_only():
assert metric_name in gbm.evals_result_[eval_set_name]
actual = len(gbm.evals_result_[eval_set_name][metric_name])
expected = assumed_iteration + (params['early_stopping_rounds']
if eval_set_name != 'training'
and assumed_iteration != gbm.n_estimators else 0)
expected = assumed_iteration + (
params["early_stopping_rounds"]
if eval_set_name != "training" and assumed_iteration != gbm.n_estimators
else 0
)
assert expected == actual
if eval_set_name != 'training':
if eval_set_name != "training":
assert assumed_iteration == gbm.best_iteration_
else:
assert gbm.n_estimators == gbm.best_iteration_
......@@ -1137,14 +1114,15 @@ def test_first_metric_only():
X, y = make_synthetic_regression(n_samples=300)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, test_size=0.5, random_state=72)
params = {'n_estimators': 30,
'learning_rate': 0.8,
'num_leaves': 15,
'verbose': -1,
'seed': 123,
'early_stopping_rounds': 5} # early stop should be supported via global LightGBM parameter
params_fit = {'X': X_train,
'y': y_train}
params = {
"n_estimators": 30,
"learning_rate": 0.8,
"num_leaves": 15,
"verbose": -1,
"seed": 123,
"early_stopping_rounds": 5,
} # early stop should be supported via global LightGBM parameter
params_fit = {"X": X_train, "y": y_train}
iter_valid1_l1 = 4
iter_valid1_l2 = 4
......@@ -1157,100 +1135,116 @@ def test_first_metric_only():
iter_min_valid1 = min([iter_valid1_l1, iter_valid1_l2])
# feval
params['metric'] = 'None'
params_fit['eval_metric'] = lambda preds, train_data: [decreasing_metric(preds, train_data),
constant_metric(preds, train_data)]
params_fit['eval_set'] = (X_test1, y_test1)
fit_and_check(['valid_0'], ['decreasing_metric', 'error'], 1, False)
fit_and_check(['valid_0'], ['decreasing_metric', 'error'], 30, True)
params_fit['eval_metric'] = lambda preds, train_data: [constant_metric(preds, train_data),
decreasing_metric(preds, train_data)]
fit_and_check(['valid_0'], ['decreasing_metric', 'error'], 1, True)
params["metric"] = "None"
params_fit["eval_metric"] = lambda preds, train_data: [
decreasing_metric(preds, train_data),
constant_metric(preds, train_data),
]
params_fit["eval_set"] = (X_test1, y_test1)
fit_and_check(["valid_0"], ["decreasing_metric", "error"], 1, False)
fit_and_check(["valid_0"], ["decreasing_metric", "error"], 30, True)
params_fit["eval_metric"] = lambda preds, train_data: [
constant_metric(preds, train_data),
decreasing_metric(preds, train_data),
]
fit_and_check(["valid_0"], ["decreasing_metric", "error"], 1, True)
# single eval_set
params.pop('metric')
params_fit.pop('eval_metric')
fit_and_check(['valid_0'], ['l2'], iter_valid1_l2, False)
fit_and_check(['valid_0'], ['l2'], iter_valid1_l2, True)
params.pop("metric")
params_fit.pop("eval_metric")
fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, False)
fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, True)
params_fit['eval_metric'] = "l2"
fit_and_check(['valid_0'], ['l2'], iter_valid1_l2, False)
fit_and_check(['valid_0'], ['l2'], iter_valid1_l2, True)
params_fit["eval_metric"] = "l2"
fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, False)
fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, True)
params_fit['eval_metric'] = "l1"
fit_and_check(['valid_0'], ['l1', 'l2'], iter_min_valid1, False)
fit_and_check(['valid_0'], ['l1', 'l2'], iter_valid1_l1, True)
params_fit["eval_metric"] = "l1"
fit_and_check(["valid_0"], ["l1", "l2"], iter_min_valid1, False)
fit_and_check(["valid_0"], ["l1", "l2"], iter_valid1_l1, True)
params_fit['eval_metric'] = ["l1", "l2"]
fit_and_check(['valid_0'], ['l1', 'l2'], iter_min_valid1, False)
fit_and_check(['valid_0'], ['l1', 'l2'], iter_valid1_l1, True)
params_fit["eval_metric"] = ["l1", "l2"]
fit_and_check(["valid_0"], ["l1", "l2"], iter_min_valid1, False)
fit_and_check(["valid_0"], ["l1", "l2"], iter_valid1_l1, True)
params_fit['eval_metric'] = ["l2", "l1"]
fit_and_check(['valid_0'], ['l1', 'l2'], iter_min_valid1, False)
fit_and_check(['valid_0'], ['l1', 'l2'], iter_valid1_l2, True)
params_fit["eval_metric"] = ["l2", "l1"]
fit_and_check(["valid_0"], ["l1", "l2"], iter_min_valid1, False)
fit_and_check(["valid_0"], ["l1", "l2"], iter_valid1_l2, True)
params_fit['eval_metric'] = ["l2", "regression", "mse"] # test aliases
fit_and_check(['valid_0'], ['l2'], iter_valid1_l2, False)
fit_and_check(['valid_0'], ['l2'], iter_valid1_l2, True)
params_fit["eval_metric"] = ["l2", "regression", "mse"] # test aliases
fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, False)
fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, True)
# two eval_set
params_fit['eval_set'] = [(X_test1, y_test1), (X_test2, y_test2)]
params_fit['eval_metric'] = ["l1", "l2"]
fit_and_check(['valid_0', 'valid_1'], ['l1', 'l2'], iter_min_l1, True)
params_fit['eval_metric'] = ["l2", "l1"]
fit_and_check(['valid_0', 'valid_1'], ['l1', 'l2'], iter_min_l2, True)
params_fit["eval_set"] = [(X_test1, y_test1), (X_test2, y_test2)]
params_fit["eval_metric"] = ["l1", "l2"]
fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min_l1, True)
params_fit["eval_metric"] = ["l2", "l1"]
fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min_l2, True)
params_fit['eval_set'] = [(X_test2, y_test2), (X_test1, y_test1)]
params_fit['eval_metric'] = ["l1", "l2"]
fit_and_check(['valid_0', 'valid_1'], ['l1', 'l2'], iter_min, False)
fit_and_check(['valid_0', 'valid_1'], ['l1', 'l2'], iter_min_l1, True)
params_fit['eval_metric'] = ["l2", "l1"]
fit_and_check(['valid_0', 'valid_1'], ['l1', 'l2'], iter_min, False)
fit_and_check(['valid_0', 'valid_1'], ['l1', 'l2'], iter_min_l2, True)
params_fit["eval_set"] = [(X_test2, y_test2), (X_test1, y_test1)]
params_fit["eval_metric"] = ["l1", "l2"]
fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min, False)
fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min_l1, True)
params_fit["eval_metric"] = ["l2", "l1"]
fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min, False)
fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min_l2, True)
def test_class_weight():
X, y = load_digits(n_class=10, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train_str = y_train.astype('str')
y_test_str = y_test.astype('str')
gbm = lgb.LGBMClassifier(n_estimators=10, class_weight='balanced', verbose=-1)
gbm.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test), (X_test, y_test),
(X_test, y_test), (X_test, y_test)],
eval_class_weight=['balanced', None, 'balanced', {1: 10, 4: 20}, {5: 30, 2: 40}])
y_train_str = y_train.astype("str")
y_test_str = y_test.astype("str")
gbm = lgb.LGBMClassifier(n_estimators=10, class_weight="balanced", verbose=-1)
gbm.fit(
X_train,
y_train,
eval_set=[(X_train, y_train), (X_test, y_test), (X_test, y_test), (X_test, y_test), (X_test, y_test)],
eval_class_weight=["balanced", None, "balanced", {1: 10, 4: 20}, {5: 30, 2: 40}],
)
for eval_set1, eval_set2 in itertools.combinations(gbm.evals_result_.keys(), 2):
for metric in gbm.evals_result_[eval_set1]:
np.testing.assert_raises(AssertionError,
np.testing.assert_allclose,
gbm.evals_result_[eval_set1][metric],
gbm.evals_result_[eval_set2][metric])
gbm_str = lgb.LGBMClassifier(n_estimators=10, class_weight='balanced', verbose=-1)
gbm_str.fit(X_train, y_train_str,
eval_set=[(X_train, y_train_str), (X_test, y_test_str),
(X_test, y_test_str), (X_test, y_test_str), (X_test, y_test_str)],
eval_class_weight=['balanced', None, 'balanced', {'1': 10, '4': 20}, {'5': 30, '2': 40}])
np.testing.assert_raises(
AssertionError,
np.testing.assert_allclose,
gbm.evals_result_[eval_set1][metric],
gbm.evals_result_[eval_set2][metric],
)
gbm_str = lgb.LGBMClassifier(n_estimators=10, class_weight="balanced", verbose=-1)
gbm_str.fit(
X_train,
y_train_str,
eval_set=[
(X_train, y_train_str),
(X_test, y_test_str),
(X_test, y_test_str),
(X_test, y_test_str),
(X_test, y_test_str),
],
eval_class_weight=["balanced", None, "balanced", {"1": 10, "4": 20}, {"5": 30, "2": 40}],
)
for eval_set1, eval_set2 in itertools.combinations(gbm_str.evals_result_.keys(), 2):
for metric in gbm_str.evals_result_[eval_set1]:
np.testing.assert_raises(AssertionError,
np.testing.assert_allclose,
gbm_str.evals_result_[eval_set1][metric],
gbm_str.evals_result_[eval_set2][metric])
np.testing.assert_raises(
AssertionError,
np.testing.assert_allclose,
gbm_str.evals_result_[eval_set1][metric],
gbm_str.evals_result_[eval_set2][metric],
)
for eval_set in gbm.evals_result_:
for metric in gbm.evals_result_[eval_set]:
np.testing.assert_allclose(gbm.evals_result_[eval_set][metric],
gbm_str.evals_result_[eval_set][metric])
np.testing.assert_allclose(gbm.evals_result_[eval_set][metric], gbm_str.evals_result_[eval_set][metric])
def test_continue_training_with_model():
X, y = load_digits(n_class=3, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
init_gbm = lgb.LGBMClassifier(n_estimators=5).fit(X_train, y_train, eval_set=(X_test, y_test))
gbm = lgb.LGBMClassifier(n_estimators=5).fit(X_train, y_train, eval_set=(X_test, y_test),
init_model=init_gbm)
assert len(init_gbm.evals_result_['valid_0']['multi_logloss']) == len(gbm.evals_result_['valid_0']['multi_logloss'])
assert len(init_gbm.evals_result_['valid_0']['multi_logloss']) == 5
assert gbm.evals_result_['valid_0']['multi_logloss'][-1] < init_gbm.evals_result_['valid_0']['multi_logloss'][-1]
gbm = lgb.LGBMClassifier(n_estimators=5).fit(X_train, y_train, eval_set=(X_test, y_test), init_model=init_gbm)
assert len(init_gbm.evals_result_["valid_0"]["multi_logloss"]) == len(gbm.evals_result_["valid_0"]["multi_logloss"])
assert len(init_gbm.evals_result_["valid_0"]["multi_logloss"]) == 5
assert gbm.evals_result_["valid_0"]["multi_logloss"][-1] < init_gbm.evals_result_["valid_0"]["multi_logloss"][-1]
def test_actual_number_of_trees():
......@@ -1288,20 +1282,16 @@ def test_sklearn_integration(estimator, check):
check(estimator)
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification', 'ranking', 'regression'])
@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "ranking", "regression"])
def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task):
pd = pytest.importorskip("pandas")
X, y, g = _create_data(task)
X = pd.DataFrame(X)
y_col_array = y.reshape(-1, 1)
params = {
'n_estimators': 1,
'num_leaves': 3,
'random_state': 0
}
params = {"n_estimators": 1, "num_leaves": 3, "random_state": 0}
model_factory = task_to_model_factory[task]
with pytest.warns(UserWarning, match='column-vector'):
if task == 'ranking':
with pytest.warns(UserWarning, match="column-vector"):
if task == "ranking":
model_1d = model_factory(**params).fit(X, y, group=g)
model_2d = model_factory(**params).fit(X, y_col_array, group=g)
else:
......@@ -1313,12 +1303,12 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task
np.testing.assert_array_equal(preds_1d, preds_2d)
@pytest.mark.parametrize('use_weight', [True, False])
@pytest.mark.parametrize("use_weight", [True, False])
def test_multiclass_custom_objective(use_weight):
centers = [[-4, -4], [4, 4], [-4, 4]]
X, y = make_blobs(n_samples=1_000, centers=centers, random_state=42)
weight = np.full_like(y, 2) if use_weight else None
params = {'n_estimators': 10, 'num_leaves': 7}
params = {"n_estimators": 10, "num_leaves": 7}
builtin_obj_model = lgb.LGBMClassifier(**params)
builtin_obj_model.fit(X, y, sample_weight=weight)
builtin_obj_preds = builtin_obj_model.predict_proba(X)
......@@ -1332,11 +1322,11 @@ def test_multiclass_custom_objective(use_weight):
assert callable(custom_obj_model.objective_)
@pytest.mark.parametrize('use_weight', [True, False])
@pytest.mark.parametrize("use_weight", [True, False])
def test_multiclass_custom_eval(use_weight):
def custom_eval(y_true, y_pred, weight):
loss = log_loss(y_true, y_pred, sample_weight=weight)
return 'custom_logloss', loss, False
return "custom_logloss", loss, False
centers = [[-4, -4], [4, 4], [-4, 4]]
X, y = make_blobs(n_samples=1_000, centers=centers, random_state=42)
......@@ -1348,27 +1338,25 @@ def test_multiclass_custom_eval(use_weight):
else:
weight_train = None
weight_valid = None
params = {'objective': 'multiclass', 'num_class': 3, 'num_leaves': 7}
params = {"objective": "multiclass", "num_class": 3, "num_leaves": 7}
model = lgb.LGBMClassifier(**params)
model.fit(
X_train,
y_train,
sample_weight=weight_train,
eval_set=[(X_train, y_train), (X_valid, y_valid)],
eval_names=['train', 'valid'],
eval_names=["train", "valid"],
eval_sample_weight=[weight_train, weight_valid],
eval_metric=custom_eval,
)
eval_result = model.evals_result_
train_ds = (X_train, y_train, weight_train)
valid_ds = (X_valid, y_valid, weight_valid)
for key, (X, y_true, weight) in zip(['train', 'valid'], [train_ds, valid_ds]):
np.testing.assert_allclose(
eval_result[key]['multi_logloss'], eval_result[key]['custom_logloss']
)
for key, (X, y_true, weight) in zip(["train", "valid"], [train_ds, valid_ds]):
np.testing.assert_allclose(eval_result[key]["multi_logloss"], eval_result[key]["custom_logloss"])
y_pred = model.predict_proba(X)
_, metric_value, _ = custom_eval(y_true, y_pred, weight)
np.testing.assert_allclose(metric_value, eval_result[key]['custom_logloss'][-1])
np.testing.assert_allclose(metric_value, eval_result[key]["custom_logloss"][-1])
def test_negative_n_jobs(tmp_path):
......@@ -1397,21 +1385,21 @@ def test_default_n_jobs(tmp_path):
assert bool(re.search(rf"\[num_threads: {n_cores}\]", model_txt))
@pytest.mark.skipif(not PANDAS_INSTALLED, reason='pandas is not installed')
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification', 'ranking', 'regression'])
@pytest.mark.skipif(not PANDAS_INSTALLED, reason="pandas is not installed")
@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "ranking", "regression"])
def test_validate_features(task):
X, y, g = _create_data(task, n_features=4)
features = ['x1', 'x2', 'x3', 'x4']
features = ["x1", "x2", "x3", "x4"]
df = pd_DataFrame(X, columns=features)
model = task_to_model_factory[task](n_estimators=10, num_leaves=15, verbose=-1)
if task == 'ranking':
if task == "ranking":
model.fit(df, y, group=g)
else:
model.fit(df, y)
assert model.feature_name_ == features
# try to predict with a different feature
df2 = df.rename(columns={'x2': 'z'})
df2 = df.rename(columns={"x2": "z"})
with pytest.raises(lgb.basic.LightGBMError, match="Expected 'x2' at position 1 but found 'z'"):
model.predict(df2, validate_features=True)
......@@ -1419,59 +1407,59 @@ def test_validate_features(task):
model.predict(df2, validate_features=False)
@pytest.mark.parametrize('X_type', ['dt_DataTable', 'list2d', 'numpy', 'scipy_csc', 'scipy_csr', 'pd_DataFrame'])
@pytest.mark.parametrize('y_type', ['list1d', 'numpy', 'pd_Series', 'pd_DataFrame'])
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification', 'regression'])
@pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
@pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_Series", "pd_DataFrame"])
@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "regression"])
def test_classification_and_regression_minimally_work_with_all_all_accepted_data_types(X_type, y_type, task):
if any(t.startswith("pd_") for t in [X_type, y_type]) and not PANDAS_INSTALLED:
pytest.skip('pandas is not installed')
pytest.skip("pandas is not installed")
if any(t.startswith("dt_") for t in [X_type, y_type]) and not DATATABLE_INSTALLED:
pytest.skip('datatable is not installed')
pytest.skip("datatable is not installed")
X, y, g = _create_data(task, n_samples=2_000)
weights = np.abs(np.random.randn(y.shape[0]))
if task == 'binary-classification' or task == 'regression':
if task == "binary-classification" or task == "regression":
init_score = np.full_like(y, np.mean(y))
elif task == 'multiclass-classification':
elif task == "multiclass-classification":
init_score = np.outer(y, np.array([0.1, 0.2, 0.7]))
else:
raise ValueError(f"Unrecognized task '{task}'")
X_valid = X * 2
if X_type == 'dt_DataTable':
if X_type == "dt_DataTable":
X = dt_DataTable(X)
elif X_type == 'list2d':
elif X_type == "list2d":
X = X.tolist()
elif X_type == 'scipy_csc':
elif X_type == "scipy_csc":
X = scipy.sparse.csc_matrix(X)
elif X_type == 'scipy_csr':
elif X_type == "scipy_csr":
X = scipy.sparse.csr_matrix(X)
elif X_type == 'pd_DataFrame':
elif X_type == "pd_DataFrame":
X = pd_DataFrame(X)
elif X_type != 'numpy':
elif X_type != "numpy":
raise ValueError(f"Unrecognized X_type: '{X_type}'")
# make weights and init_score the same type as y, just to avoid
# a huge number of combinations and therefore test cases
if y_type == 'list1d':
if y_type == "list1d":
y = y.tolist()
weights = weights.tolist()
init_score = init_score.tolist()
elif y_type == 'pd_DataFrame':
elif y_type == "pd_DataFrame":
y = pd_DataFrame(y)
weights = pd_Series(weights)
if task == 'multiclass-classification':
if task == "multiclass-classification":
init_score = pd_DataFrame(init_score)
else:
init_score = pd_Series(init_score)
elif y_type == 'pd_Series':
elif y_type == "pd_Series":
y = pd_Series(y)
weights = pd_Series(weights)
if task == 'multiclass-classification':
if task == "multiclass-classification":
init_score = pd_DataFrame(init_score)
else:
init_score = pd_Series(init_score)
elif y_type != 'numpy':
elif y_type != "numpy":
raise ValueError(f"Unrecognized y_type: '{y_type}'")
model = task_to_model_factory[task](n_estimators=10, verbose=-1)
......@@ -1482,73 +1470,73 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
init_score=init_score,
eval_set=[(X_valid, y)],
eval_sample_weight=[weights],
eval_init_score=[init_score]
eval_init_score=[init_score],
)
preds = model.predict(X)
if task == 'binary-classification':
if task == "binary-classification":
assert accuracy_score(y, preds) >= 0.99
elif task == 'multiclass-classification':
elif task == "multiclass-classification":
assert accuracy_score(y, preds) >= 0.99
elif task == 'regression':
elif task == "regression":
assert r2_score(y, preds) > 0.86
else:
raise ValueError(f"Unrecognized task: '{task}'")
@pytest.mark.parametrize('X_type', ['dt_DataTable', 'list2d', 'numpy', 'scipy_csc', 'scipy_csr', 'pd_DataFrame'])
@pytest.mark.parametrize('y_type', ['list1d', 'numpy', 'pd_DataFrame', 'pd_Series'])
@pytest.mark.parametrize('g_type', ['list1d_float', 'list1d_int', 'numpy', 'pd_Series'])
@pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
@pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_DataFrame", "pd_Series"])
@pytest.mark.parametrize("g_type", ["list1d_float", "list1d_int", "numpy", "pd_Series"])
def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type, g_type):
if any(t.startswith("pd_") for t in [X_type, y_type, g_type]) and not PANDAS_INSTALLED:
pytest.skip('pandas is not installed')
pytest.skip("pandas is not installed")
if any(t.startswith("dt_") for t in [X_type, y_type, g_type]) and not DATATABLE_INSTALLED:
pytest.skip('datatable is not installed')
X, y, g = _create_data(task='ranking', n_samples=1_000)
pytest.skip("datatable is not installed")
X, y, g = _create_data(task="ranking", n_samples=1_000)
weights = np.abs(np.random.randn(y.shape[0]))
init_score = np.full_like(y, np.mean(y))
X_valid = X * 2
if X_type == 'dt_DataTable':
if X_type == "dt_DataTable":
X = dt_DataTable(X)
elif X_type == 'list2d':
elif X_type == "list2d":
X = X.tolist()
elif X_type == 'scipy_csc':
elif X_type == "scipy_csc":
X = scipy.sparse.csc_matrix(X)
elif X_type == 'scipy_csr':
elif X_type == "scipy_csr":
X = scipy.sparse.csr_matrix(X)
elif X_type == 'pd_DataFrame':
elif X_type == "pd_DataFrame":
X = pd_DataFrame(X)
elif X_type != 'numpy':
elif X_type != "numpy":
raise ValueError(f"Unrecognized X_type: '{X_type}'")
# make weights and init_score the same type as y, just to avoid
# a huge number of combinations and therefore test cases
if y_type == 'list1d':
if y_type == "list1d":
y = y.tolist()
weights = weights.tolist()
init_score = init_score.tolist()
elif y_type == 'pd_DataFrame':
elif y_type == "pd_DataFrame":
y = pd_DataFrame(y)
weights = pd_Series(weights)
init_score = pd_Series(init_score)
elif y_type == 'pd_Series':
elif y_type == "pd_Series":
y = pd_Series(y)
weights = pd_Series(weights)
init_score = pd_Series(init_score)
elif y_type != 'numpy':
elif y_type != "numpy":
raise ValueError(f"Unrecognized y_type: '{y_type}'")
if g_type == 'list1d_float':
if g_type == "list1d_float":
g = g.astype("float").tolist()
elif g_type == 'list1d_int':
elif g_type == "list1d_int":
g = g.astype("int").tolist()
elif g_type == 'pd_Series':
elif g_type == "pd_Series":
g = pd_Series(g)
elif g_type != 'numpy':
elif g_type != "numpy":
raise ValueError(f"Unrecognized g_type: '{g_type}'")
model = task_to_model_factory['ranking'](n_estimators=10, verbose=-1)
model = task_to_model_factory["ranking"](n_estimators=10, verbose=-1)
model.fit(
X=X,
y=y,
......@@ -1558,7 +1546,7 @@ def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type
eval_set=[(X_valid, y)],
eval_sample_weight=[weights],
eval_init_score=[init_score],
eval_group=[g]
eval_group=[g],
)
preds = model.predict(X)
assert spearmanr(preds, y).correlation >= 0.99
......@@ -1570,7 +1558,7 @@ def test_classifier_fit_detects_classes_every_time():
ncols = 20
X = rng.standard_normal(size=(nrows, ncols))
y_bin = (rng.random(size=nrows) <= .3).astype(np.float64)
y_bin = (rng.random(size=nrows) <= 0.3).astype(np.float64)
y_multi = rng.integers(4, size=nrows)
model = lgb.LGBMClassifier(verbose=-1)
......
......@@ -10,7 +10,7 @@ import lightgbm as lgb
def test_register_logger(tmp_path):
logger = logging.getLogger("LightGBM")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(levelname)s | %(message)s')
formatter = logging.Formatter("%(levelname)s | %(message)s")
log_filename = tmp_path / "LightGBM_test_logger.log"
file_handler = logging.FileHandler(log_filename, mode="w", encoding="utf-8")
file_handler.setLevel(logging.DEBUG)
......@@ -18,29 +18,27 @@ def test_register_logger(tmp_path):
logger.addHandler(file_handler)
def dummy_metric(_, __):
logger.debug('In dummy_metric')
return 'dummy_metric', 1, True
logger.debug("In dummy_metric")
return "dummy_metric", 1, True
lgb.register_logger(logger)
X = np.array([[1, 2, 3],
[1, 2, 4],
[1, 2, 4],
[1, 2, 3]],
dtype=np.float32)
X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]], dtype=np.float32)
y = np.array([0, 1, 1, 0])
lgb_train = lgb.Dataset(X, y)
lgb_valid = lgb.Dataset(X, y) # different object for early-stopping
eval_records = {}
callbacks = [
lgb.record_evaluation(eval_records),
lgb.log_evaluation(2),
lgb.early_stopping(10)
]
lgb.train({'objective': 'binary', 'metric': ['auc', 'binary_error']},
lgb_train, num_boost_round=10, feval=dummy_metric,
valid_sets=[lgb_valid], categorical_feature=[1], callbacks=callbacks)
callbacks = [lgb.record_evaluation(eval_records), lgb.log_evaluation(2), lgb.early_stopping(10)]
lgb.train(
{"objective": "binary", "metric": ["auc", "binary_error"]},
lgb_train,
num_boost_round=10,
feval=dummy_metric,
valid_sets=[lgb_valid],
categorical_feature=[1],
callbacks=callbacks,
)
lgb.plot_metric(eval_records)
......@@ -89,7 +87,7 @@ WARNING | More than one metric available, picking one to plot.
"INFO | [LightGBM] [Warning] GPU acceleration is disabled because no non-trivial dense features can be found",
"INFO | [LightGBM] [Warning] Using sparse features with CUDA is currently not supported.",
"INFO | [LightGBM] [Warning] CUDA currently requires double precision calculations.",
"INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!"
"INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!",
]
cuda_lines = [
"INFO | [LightGBM] [Warning] Metric auc is not implemented in cuda version. Fall back to evaluation on CPU.",
......@@ -142,11 +140,7 @@ def test_register_custom_logger():
logged_messages.append(msg)
custom_logger = CustomLogger()
lgb.register_logger(
custom_logger,
info_method_name="custom_info",
warning_method_name="custom_warning"
)
lgb.register_logger(custom_logger, info_method_name="custom_info", warning_method_name="custom_warning")
lgb.basic._log_info("info message")
lgb.basic._log_warning("warning message")
......@@ -155,18 +149,14 @@ def test_register_custom_logger():
assert logged_messages == expected_log
logged_messages = []
X = np.array([[1, 2, 3],
[1, 2, 4],
[1, 2, 4],
[1, 2, 3]],
dtype=np.float32)
X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]], dtype=np.float32)
y = np.array([0, 1, 1, 0])
lgb_data = lgb.Dataset(X, y)
lgb.train(
{'objective': 'binary', 'metric': 'auc'},
{"objective": "binary", "metric": "auc"},
lgb_data,
num_boost_round=10,
valid_sets=[lgb_data],
categorical_feature=[1]
categorical_feature=[1],
)
assert logged_messages, "custom logger was not called"
......@@ -34,8 +34,9 @@ def load_linnerud(**kwargs):
return sklearn.datasets.load_linnerud(**kwargs)
def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
group=None, random_gs=False, avg_gs=10, random_state=0):
def make_ranking(
n_samples=100, n_features=20, n_informative=5, gmax=2, group=None, random_gs=False, avg_gs=10, random_state=0
):
"""Generate a learning-to-rank dataset - feature vectors grouped together with
integer-valued graded relevance scores. Replace this with a sklearn.datasets function
if a ranking objective becomes supported in the sklearn.datasets module.
......@@ -81,7 +82,7 @@ def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
relvalues = range(gmax + 1)
# build y/target and group-id vectors with user-specified group sizes.
if group is not None and hasattr(group, '__len__'):
if group is not None and hasattr(group, "__len__"):
n_samples = np.sum(group)
for i, gsize in enumerate(group):
......@@ -116,8 +117,9 @@ def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
@lru_cache(maxsize=None)
def make_synthetic_regression(n_samples=100, n_features=4, n_informative=2, random_state=42):
return sklearn.datasets.make_regression(n_samples=n_samples, n_features=n_features,
n_informative=n_informative, random_state=random_state)
return sklearn.datasets.make_regression(
n_samples=n_samples, n_features=n_features, n_informative=n_informative, random_state=random_state
)
def dummy_obj(preds, train_data):
......@@ -126,7 +128,7 @@ def dummy_obj(preds, train_data):
def mse_obj(y_pred, dtrain):
y_true = dtrain.get_label()
grad = (y_pred - y_true)
grad = y_pred - y_true
hess = np.ones(len(grad))
return grad, hess
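# A minimal usage sketch for a custom objective like mse_obj above, assuming the
# LightGBM 4.x convention of passing the callable through params["objective"];
# all data below is synthetic and illustrative.
import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 5))
y = X[:, 0] + 0.1 * rng.standard_normal(200)
booster = lgb.train({"objective": mse_obj, "verbose": -1}, lgb.Dataset(X, y), num_boost_round=5)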
......@@ -157,50 +159,41 @@ def sklearn_multiclass_custom_objective(y_true, y_pred, weight=None):
def pickle_obj(obj, filepath, serializer):
if serializer == 'pickle':
with open(filepath, 'wb') as f:
if serializer == "pickle":
with open(filepath, "wb") as f:
pickle.dump(obj, f)
elif serializer == 'joblib':
elif serializer == "joblib":
joblib.dump(obj, filepath)
elif serializer == 'cloudpickle':
with open(filepath, 'wb') as f:
elif serializer == "cloudpickle":
with open(filepath, "wb") as f:
cloudpickle.dump(obj, f)
else:
raise ValueError(f'Unrecognized serializer type: {serializer}')
raise ValueError(f"Unrecognized serializer type: {serializer}")
def unpickle_obj(filepath, serializer):
if serializer == 'pickle':
with open(filepath, 'rb') as f:
if serializer == "pickle":
with open(filepath, "rb") as f:
return pickle.load(f)
elif serializer == 'joblib':
elif serializer == "joblib":
return joblib.load(filepath)
elif serializer == 'cloudpickle':
with open(filepath, 'rb') as f:
elif serializer == "cloudpickle":
with open(filepath, "rb") as f:
return cloudpickle.load(f)
else:
raise ValueError(f'Unrecognized serializer type: {serializer}')
raise ValueError(f"Unrecognized serializer type: {serializer}")
def pickle_and_unpickle_object(obj, serializer):
with lgb.basic._TempFile() as tmp_file:
pickle_obj(
obj=obj,
filepath=tmp_file.name,
serializer=serializer
)
obj_from_disk = unpickle_obj(
filepath=tmp_file.name,
serializer=serializer
)
pickle_obj(obj=obj, filepath=tmp_file.name, serializer=serializer)
obj_from_disk = unpickle_obj(filepath=tmp_file.name, serializer=serializer)
return obj_from_disk # noqa: RET504
# doing this here, at import time, to ensure it only runs once per import
# instead of once per assertion
_numpy_testing_supports_strict_kwarg = (
"strict" in getfullargspec(np.testing.assert_array_equal).kwonlyargs
)
_numpy_testing_supports_strict_kwarg = "strict" in getfullargspec(np.testing.assert_array_equal).kwonlyargs
def np_assert_array_equal(*args, **kwargs):
......