# coding: utf-8
import itertools
import math
import re
from functools import partial
from os import getenv
from pathlib import Path

import joblib
import numpy as np
import pytest
import scipy.sparse
from scipy.stats import spearmanr
from sklearn.base import clone
from sklearn.datasets import load_svmlight_file, make_blobs, make_multilabel_classification
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.multioutput import ClassifierChain, MultiOutputClassifier, MultiOutputRegressor, RegressorChain
from sklearn.utils.estimator_checks import parametrize_with_checks as sklearn_parametrize_with_checks
from sklearn.utils.validation import check_is_fitted

import lightgbm as lgb
from lightgbm.compat import (
    DATATABLE_INSTALLED,
    PANDAS_INSTALLED,
    _sklearn_version,
    dt_DataTable,
    pd_DataFrame,
    pd_Series,
)

from .utils import (
    assert_silent,
    load_breast_cancer,
    load_digits,
    load_iris,
    load_linnerud,
    make_ranking,
    make_synthetic_regression,
    sklearn_multiclass_custom_objective,
    softmax,
)

SKLEARN_MAJOR, SKLEARN_MINOR, *_ = _sklearn_version.split(".")
SKLEARN_VERSION_GTE_1_6 = (int(SKLEARN_MAJOR), int(SKLEARN_MINOR)) >= (1, 6)
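# e.g. _sklearn_version == "1.6.1" parses to major=1, minor=6, so the flag is True;
# the patch component is intentionally discarded via *_.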

decreasing_generator = itertools.count(0, -1)
estimator_classes = (lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker)
task_to_model_factory = {
    "ranking": lgb.LGBMRanker,
    "binary-classification": lgb.LGBMClassifier,
    "multiclass-classification": lgb.LGBMClassifier,
    "regression": lgb.LGBMRegressor,
}
all_tasks = tuple(task_to_model_factory.keys())


def _create_data(task, n_samples=100, n_features=4):
    if task == "ranking":
        X, y, g = make_ranking(n_features=n_features, n_samples=n_samples)
        g = np.bincount(g)
    elif task.endswith("classification"):
        if task == "binary-classification":
            centers = 2
        elif task == "multiclass-classification":
            centers = 3
        else:
            raise ValueError(f"Unknown classification task '{task}'")
        X, y = make_blobs(n_samples=n_samples, n_features=n_features, centers=centers, random_state=42)
        g = None
    elif task == "regression":
        X, y = make_synthetic_regression(n_samples=n_samples, n_features=n_features)
        g = None
    else:
        raise ValueError(f"Unknown task '{task}'")
    return X, y, g
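
# Contract of the helper above (illustrative): for task="ranking", `g` holds
# per-query group sizes (via np.bincount over group ids) suitable for
# LGBMRanker's `group=` argument; for classification and regression, `g` is None.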


class UnpicklableCallback:
    def __reduce__(self):
        raise Exception("This class is not picklable")

    def __call__(self, env):
        env.model.attr_set_inside_callback = env.iteration * 10
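
# `__reduce__` always raising makes instances unpicklable, which is what
# test_non_serializable_objects_in_callbacks below relies on. The callback
# protocol is just "a callable taking a CallbackEnv": `env.model` is the Booster
# being trained and `env.iteration` is the 0-based iteration index, so with
# n_estimators=5 the attribute ends up as 4 * 10 == 40.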


def custom_asymmetric_obj(y_true, y_pred):
    residual = (y_true - y_pred).astype(np.float64)
    grad = np.where(residual < 0, -2 * 10.0 * residual, -2 * residual)
    hess = np.where(residual < 0, 2 * 10.0, 2.0)
    return grad, hess


def objective_ls(y_true, y_pred):
    grad = y_pred - y_true
    hess = np.ones(len(y_true))
    return grad, hess


def logregobj(y_true, y_pred):
    y_pred = 1.0 / (1.0 + np.exp(-y_pred))
    grad = y_pred - y_true
    hess = y_pred * (1.0 - y_pred)
    return grad, hess


def custom_dummy_obj(y_true, y_pred):
    return np.ones(y_true.shape), np.ones(y_true.shape)
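

# A minimal sketch (not run by the test suite) of how the analytic gradients above
# can be validated: central finite differences of the pointwise log loss should
# match logregobj's grad/hess. The helper name, sample sizes, and tolerances are
# illustrative assumptions, not LightGBM API.
def _check_logregobj_against_finite_differences(eps=1e-4):
    rng = np.random.default_rng(0)
    y_true = rng.integers(0, 2, size=20).astype(np.float64)
    raw = rng.normal(size=20)

    def pointwise_logloss(raw_score):
        p = 1.0 / (1.0 + np.exp(-raw_score))
        return -(y_true * np.log(p) + (1.0 - y_true) * np.log(1.0 - p))

    grad, hess = logregobj(y_true, raw)
    # first derivative of the loss w.r.t. the raw score is sigmoid(raw) - y
    grad_fd = (pointwise_logloss(raw + eps) - pointwise_logloss(raw - eps)) / (2 * eps)
    # second derivative is p * (1 - p)
    hess_fd = (pointwise_logloss(raw + eps) - 2 * pointwise_logloss(raw) + pointwise_logloss(raw - eps)) / eps**2
    np.testing.assert_allclose(grad, grad_fd, rtol=1e-4)
    np.testing.assert_allclose(hess, hess_fd, rtol=1e-3)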


def constant_metric(y_true, y_pred):
    return "error", 0, False


def decreasing_metric(y_true, y_pred):
    return ("decreasing_metric", next(decreasing_generator), False)


def mse(y_true, y_pred):
    return "custom MSE", mean_squared_error(y_true, y_pred), False


def binary_error(y_true, y_pred):
    return np.mean((y_pred > 0.5) != y_true)


def multi_error(y_true, y_pred):
    return np.mean(y_true != y_pred)


def multi_logloss(y_true, y_pred):
    return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)])
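
# Note on the custom-metric contract: each callable above returns a
# (name, value, is_higher_better) tuple, which is the shape `eval_metric`
# expects in the scikit-learn API. Illustrative check (assumed inputs, not
# part of the suite):
#     name, value, higher_better = mse(np.array([1.0]), np.array([0.0]))
#     assert (name, value, higher_better) == ("custom MSE", 1.0, False)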


def test_binary():
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    gbm = lgb.LGBMClassifier(n_estimators=50, verbose=-1)
    gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[lgb.early_stopping(5)])
    ret = log_loss(y_test, gbm.predict_proba(X_test))
    assert ret < 0.12
    assert gbm.evals_result_["valid_0"]["binary_logloss"][gbm.best_iteration_ - 1] == pytest.approx(ret)


def test_regression():
    X, y = make_synthetic_regression()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    gbm = lgb.LGBMRegressor(n_estimators=50, verbose=-1)
    gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[lgb.early_stopping(5)])
    ret = mean_squared_error(y_test, gbm.predict(X_test))
    assert ret < 174
    assert gbm.evals_result_["valid_0"]["l2"][gbm.best_iteration_ - 1] == pytest.approx(ret)


@pytest.mark.skipif(
    getenv("TASK", "") == "cuda", reason="Skip due to differences in implementation details of CUDA version"
)
def test_multiclass():
    X, y = load_digits(n_class=10, return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    gbm = lgb.LGBMClassifier(n_estimators=50, verbose=-1)
    gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[lgb.early_stopping(5)])
    ret = multi_error(y_test, gbm.predict(X_test))
    assert ret < 0.05
    ret = multi_logloss(y_test, gbm.predict_proba(X_test))
    assert ret < 0.16
    assert gbm.evals_result_["valid_0"]["multi_logloss"][gbm.best_iteration_ - 1] == pytest.approx(ret)


@pytest.mark.skipif(
    getenv("TASK", "") == "cuda", reason="Skip due to differences in implementation details of CUDA version"
)
def test_lambdarank():
    rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank"
    X_train, y_train = load_svmlight_file(str(rank_example_dir / "rank.train"))
    X_test, y_test = load_svmlight_file(str(rank_example_dir / "rank.test"))
    q_train = np.loadtxt(str(rank_example_dir / "rank.train.query"))
    q_test = np.loadtxt(str(rank_example_dir / "rank.test.query"))
    gbm = lgb.LGBMRanker(n_estimators=50)
    gbm.fit(
        X_train,
        y_train,
        group=q_train,
        eval_set=[(X_test, y_test)],
        eval_group=[q_test],
        eval_at=[1, 3],
        callbacks=[lgb.early_stopping(10), lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))],
    )
    assert gbm.best_iteration_ <= 24
    assert gbm.best_score_["valid_0"]["ndcg@1"] > 0.5674
    assert gbm.best_score_["valid_0"]["ndcg@3"] > 0.578


def test_xendcg():
    xendcg_example_dir = Path(__file__).absolute().parents[2] / "examples" / "xendcg"
    X_train, y_train = load_svmlight_file(str(xendcg_example_dir / "rank.train"))
    X_test, y_test = load_svmlight_file(str(xendcg_example_dir / "rank.test"))
    q_train = np.loadtxt(str(xendcg_example_dir / "rank.train.query"))
    q_test = np.loadtxt(str(xendcg_example_dir / "rank.test.query"))
    gbm = lgb.LGBMRanker(n_estimators=50, objective="rank_xendcg", random_state=5, n_jobs=1)
    gbm.fit(
        X_train,
        y_train,
        group=q_train,
        eval_set=[(X_test, y_test)],
        eval_group=[q_test],
        eval_at=[1, 3],
        eval_metric="ndcg",
        callbacks=[lgb.early_stopping(10), lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))],
    )
    assert gbm.best_iteration_ <= 24
    assert gbm.best_score_["valid_0"]["ndcg@1"] > 0.6211
    assert gbm.best_score_["valid_0"]["ndcg@3"] > 0.6253


def test_eval_at_aliases():
    rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank"
    X_train, y_train = load_svmlight_file(str(rank_example_dir / "rank.train"))
    X_test, y_test = load_svmlight_file(str(rank_example_dir / "rank.test"))
    q_train = np.loadtxt(str(rank_example_dir / "rank.train.query"))
    q_test = np.loadtxt(str(rank_example_dir / "rank.test.query"))
    for alias in lgb.basic._ConfigAliases.get("eval_at"):
        gbm = lgb.LGBMRanker(n_estimators=5, **{alias: [1, 2, 3, 9]})
        with pytest.warns(UserWarning, match=f"Found '{alias}' in params. Will use it instead of 'eval_at' argument"):
            gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)], eval_group=[q_test])
        assert list(gbm.evals_result_["valid_0"].keys()) == ["ndcg@1", "ndcg@2", "ndcg@3", "ndcg@9"]


@pytest.mark.parametrize("custom_objective", [True, False])
def test_objective_aliases(custom_objective):
    X, y = make_synthetic_regression()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    if custom_objective:
        obj = custom_dummy_obj
        metric_name = "l2"  # default one
    else:
        obj = "mape"
        metric_name = "mape"
    evals = []
    for alias in lgb.basic._ConfigAliases.get("objective"):
        gbm = lgb.LGBMRegressor(n_estimators=5, **{alias: obj})
        if alias != "objective":
            with pytest.warns(
                UserWarning, match=f"Found '{alias}' in params. Will use it instead of 'objective' argument"
            ):
                gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)])
        else:
            gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)])
        assert list(gbm.evals_result_["valid_0"].keys()) == [metric_name]
        evals.append(gbm.evals_result_["valid_0"][metric_name])
    evals_t = np.array(evals).T
    for i in range(evals_t.shape[0]):
        np.testing.assert_allclose(evals_t[i], evals_t[i][0])
    # check that the dummy objective was really used and the estimator didn't learn anything
    if custom_objective:
        np.testing.assert_allclose(evals_t, evals_t[0][0])


def test_regression_with_custom_objective():
    X, y = make_synthetic_regression()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    gbm = lgb.LGBMRegressor(n_estimators=50, verbose=-1, objective=objective_ls)
    gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[lgb.early_stopping(5)])
    ret = mean_squared_error(y_test, gbm.predict(X_test))
    assert ret < 174
    assert gbm.evals_result_["valid_0"]["l2"][gbm.best_iteration_ - 1] == pytest.approx(ret)


def test_binary_classification_with_custom_objective():
    X, y = load_digits(n_class=2, return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    gbm = lgb.LGBMClassifier(n_estimators=50, verbose=-1, objective=logregobj)
    gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[lgb.early_stopping(5)])
    # with a custom objective, predictions are raw scores (not transformed to probabilities)
    y_pred_raw = gbm.predict_proba(X_test)
    assert not np.all(y_pred_raw >= 0)
    y_pred = 1.0 / (1.0 + np.exp(-y_pred_raw))
    ret = binary_error(y_test, y_pred)
    assert ret < 0.05


def test_dart():
    X, y = make_synthetic_regression()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    gbm = lgb.LGBMRegressor(boosting_type="dart", n_estimators=50)
    gbm.fit(X_train, y_train)
    score = gbm.score(X_test, y_test)
    assert 0.8 <= score <= 1.0


def test_stacking_classifier():
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    classifiers = [("gbm1", lgb.LGBMClassifier(n_estimators=3)), ("gbm2", lgb.LGBMClassifier(n_estimators=3))]
    clf = StackingClassifier(
        estimators=classifiers, final_estimator=lgb.LGBMClassifier(n_estimators=3), passthrough=True
    )
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    assert score >= 0.8
    assert score <= 1.0
    assert clf.n_features_in_ == 4  # number of input features
    assert len(clf.named_estimators_["gbm1"].feature_importances_) == 4
    assert clf.named_estimators_["gbm1"].n_features_in_ == clf.named_estimators_["gbm2"].n_features_in_
    assert clf.final_estimator_.n_features_in_ == 10  # number of concatenated features
    assert len(clf.final_estimator_.feature_importances_) == 10
    assert all(clf.named_estimators_["gbm1"].classes_ == clf.named_estimators_["gbm2"].classes_)
    assert all(clf.classes_ == clf.named_estimators_["gbm1"].classes_)


def test_stacking_regressor():
    X, y = make_synthetic_regression(n_samples=200)
    n_features = X.shape[1]
    n_input_models = 2
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    regressors = [("gbm1", lgb.LGBMRegressor(n_estimators=3)), ("gbm2", lgb.LGBMRegressor(n_estimators=3))]
    reg = StackingRegressor(estimators=regressors, final_estimator=lgb.LGBMRegressor(n_estimators=3), passthrough=True)
    reg.fit(X_train, y_train)
    score = reg.score(X_test, y_test)
    assert score >= 0.2
    assert score <= 1.0
    assert reg.n_features_in_ == n_features  # number of input features
    assert len(reg.named_estimators_["gbm1"].feature_importances_) == n_features
    assert reg.named_estimators_["gbm1"].n_features_in_ == reg.named_estimators_["gbm2"].n_features_in_
    assert reg.final_estimator_.n_features_in_ == n_features + n_input_models  # number of concatenated features
    assert len(reg.final_estimator_.feature_importances_) == n_features + n_input_models


def test_grid_search():
    X, y = load_iris(return_X_y=True)
    y = y.astype(str)  # utilize label encoder at its max power
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
    params = {"subsample": 0.8, "subsample_freq": 1}
    grid_params = {"boosting_type": ["rf", "gbdt"], "n_estimators": [4, 6], "reg_alpha": [0.01, 0.005]}
    evals_result = {}
    fit_params = {
        "eval_set": [(X_val, y_val)],
        "eval_metric": constant_metric,
        "callbacks": [lgb.early_stopping(2), lgb.record_evaluation(evals_result)],
    }
    grid = GridSearchCV(estimator=lgb.LGBMClassifier(**params), param_grid=grid_params, cv=2)
    grid.fit(X_train, y_train, **fit_params)
    score = grid.score(X_test, y_test)  # utilizes GridSearchCV default refit=True
    assert grid.best_params_["boosting_type"] in ["rf", "gbdt"]
    assert grid.best_params_["n_estimators"] in [4, 6]
    assert grid.best_params_["reg_alpha"] in [0.01, 0.005]
    assert grid.best_score_ <= 1.0
    assert grid.best_estimator_.best_iteration_ == 1
    assert grid.best_estimator_.best_score_["valid_0"]["multi_logloss"] < 0.25
    assert grid.best_estimator_.best_score_["valid_0"]["error"] == 0
    assert score >= 0.2
    assert score <= 1.0
    assert evals_result == grid.best_estimator_.evals_result_


def test_random_search(rng):
    X, y = load_iris(return_X_y=True)
    y = y.astype(str)  # utilize label encoder at its max power
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
    n_iter = 3  # number of sampled parameter settings
    params = {"subsample": 0.8, "subsample_freq": 1}
    param_dist = {
        "boosting_type": ["rf", "gbdt"],
        "n_estimators": rng.integers(low=3, high=10, size=(n_iter,)).tolist(),
        "reg_alpha": rng.uniform(low=0.01, high=0.06, size=(n_iter,)).tolist(),
    }
    fit_params = {"eval_set": [(X_val, y_val)], "eval_metric": constant_metric, "callbacks": [lgb.early_stopping(2)]}
    rand = RandomizedSearchCV(
        estimator=lgb.LGBMClassifier(**params), param_distributions=param_dist, cv=2, n_iter=n_iter, random_state=42
    )
    rand.fit(X_train, y_train, **fit_params)
    score = rand.score(X_test, y_test)  # utilizes RandomizedSearchCV default refit=True
    assert rand.best_params_["boosting_type"] in ["rf", "gbdt"]
    assert rand.best_params_["n_estimators"] in list(range(3, 10))
    assert rand.best_params_["reg_alpha"] >= 0.01  # left-closed boundary point
    assert rand.best_params_["reg_alpha"] <= 0.06  # right-closed boundary point
    assert rand.best_score_ <= 1.0
    assert rand.best_estimator_.best_score_["valid_0"]["multi_logloss"] < 0.25
    assert rand.best_estimator_.best_score_["valid_0"]["error"] == 0
    assert score >= 0.2
    assert score <= 1.0


def test_multioutput_classifier():
    n_outputs = 3
    X, y = make_multilabel_classification(n_samples=100, n_features=20, n_classes=n_outputs, random_state=0)
    y = y.astype(str)  # utilize label encoder at its max power
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    clf = MultiOutputClassifier(estimator=lgb.LGBMClassifier(n_estimators=10))
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    assert score >= 0.2
    assert score <= 1.0
    np.testing.assert_array_equal(np.tile(np.unique(y_train), n_outputs), np.concatenate(clf.classes_))
    for classifier in clf.estimators_:
        assert isinstance(classifier, lgb.LGBMClassifier)
        assert isinstance(classifier.booster_, lgb.Booster)


def test_multioutput_regressor():
    bunch = load_linnerud(as_frame=True)  # returns a Bunch instance
    X, y = bunch["data"], bunch["target"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    reg = MultiOutputRegressor(estimator=lgb.LGBMRegressor(n_estimators=10))
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    _, score, _ = mse(y_test, y_pred)
    assert score >= 0.2
    assert score <= 120.0
    for regressor in reg.estimators_:
        assert isinstance(regressor, lgb.LGBMRegressor)
        assert isinstance(regressor.booster_, lgb.Booster)


def test_classifier_chain():
    n_outputs = 3
    X, y = make_multilabel_classification(n_samples=100, n_features=20, n_classes=n_outputs, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    order = [2, 0, 1]
    clf = ClassifierChain(base_estimator=lgb.LGBMClassifier(n_estimators=10), order=order, random_state=42)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    assert score >= 0.2
    assert score <= 1.0
    np.testing.assert_array_equal(np.tile(np.unique(y_train), n_outputs), np.concatenate(clf.classes_))
    assert order == clf.order_
    for classifier in clf.estimators_:
        assert isinstance(classifier, lgb.LGBMClassifier)
        assert isinstance(classifier.booster_, lgb.Booster)


def test_regressor_chain():
    bunch = load_linnerud(as_frame=True)  # returns a Bunch instance
    X, y = bunch["data"], bunch["target"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    order = [2, 0, 1]
    reg = RegressorChain(base_estimator=lgb.LGBMRegressor(n_estimators=10), order=order, random_state=42)
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    _, score, _ = mse(y_test, y_pred)
    assert score >= 0.2
    assert score <= 120.0
    assert order == reg.order_
    for regressor in reg.estimators_:
        assert isinstance(regressor, lgb.LGBMRegressor)
        assert isinstance(regressor.booster_, lgb.Booster)


def test_clone_and_property():
    X, y = make_synthetic_regression()
    gbm = lgb.LGBMRegressor(n_estimators=10, verbose=-1)
    gbm.fit(X, y)

    gbm_clone = clone(gbm)

    # original estimator is unaffected
    assert gbm.n_estimators == 10
    assert gbm.verbose == -1
    assert isinstance(gbm.booster_, lgb.Booster)
    assert isinstance(gbm.feature_importances_, np.ndarray)

    # new estimator is unfitted, but has the same parameters
    assert gbm_clone.__sklearn_is_fitted__() is False
    assert gbm_clone.n_estimators == 10
    assert gbm_clone.verbose == -1
    assert gbm_clone.get_params() == gbm.get_params()

    X, y = load_digits(n_class=2, return_X_y=True)
    clf = lgb.LGBMClassifier(n_estimators=10, verbose=-1)
    clf.fit(X, y)
    assert sorted(clf.classes_) == [0, 1]
    assert clf.n_classes_ == 2
    assert isinstance(clf.booster_, lgb.Booster)
    assert isinstance(clf.feature_importances_, np.ndarray)


def test_joblib(tmp_path):
    X, y = make_synthetic_regression()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    gbm = lgb.LGBMRegressor(n_estimators=10, objective=custom_asymmetric_obj, verbose=-1, importance_type="split")
    gbm.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_metric=mse,
        callbacks=[lgb.early_stopping(5), lgb.reset_parameter(learning_rate=list(np.arange(1, 0, -0.1)))],
    )

    model_path_pkl = str(tmp_path / "lgb.pkl")
    joblib.dump(gbm, model_path_pkl)  # test model with custom functions
    gbm_pickle = joblib.load(model_path_pkl)
    assert isinstance(gbm_pickle.booster_, lgb.Booster)
    assert gbm.get_params() == gbm_pickle.get_params()
    np.testing.assert_array_equal(gbm.feature_importances_, gbm_pickle.feature_importances_)
    assert gbm_pickle.learning_rate == pytest.approx(0.1)
    assert callable(gbm_pickle.objective)

    for eval_set in gbm.evals_result_:
        for metric in gbm.evals_result_[eval_set]:
            np.testing.assert_allclose(gbm.evals_result_[eval_set][metric], gbm_pickle.evals_result_[eval_set][metric])

    pred_origin = gbm.predict(X_test)
    pred_pickle = gbm_pickle.predict(X_test)
    np.testing.assert_allclose(pred_origin, pred_pickle)


def test_non_serializable_objects_in_callbacks(tmp_path):
    unpicklable_callback = UnpicklableCallback()

    with pytest.raises(Exception, match="This class is not picklable"):
        joblib.dump(unpicklable_callback, tmp_path / "tmp.joblib")

    X, y = make_synthetic_regression()
    gbm = lgb.LGBMRegressor(n_estimators=5)
    gbm.fit(X, y, callbacks=[unpicklable_callback])
    assert gbm.booster_.attr_set_inside_callback == 40


@pytest.mark.parametrize("rng_constructor", [np.random.RandomState, np.random.default_rng])
def test_random_state_object(rng_constructor):
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    state1 = rng_constructor(123)
    state2 = rng_constructor(123)
    clf1 = lgb.LGBMClassifier(n_estimators=10, subsample=0.5, subsample_freq=1, random_state=state1)
    clf2 = lgb.LGBMClassifier(n_estimators=10, subsample=0.5, subsample_freq=1, random_state=state2)
    # Test if random_state is properly stored
    assert clf1.random_state is state1
    assert clf2.random_state is state2
    # Test if two random states produce identical models
    clf1.fit(X_train, y_train)
    clf2.fit(X_train, y_train)
    y_pred1 = clf1.predict(X_test, raw_score=True)
    y_pred2 = clf2.predict(X_test, raw_score=True)
    np.testing.assert_allclose(y_pred1, y_pred2)
    np.testing.assert_array_equal(clf1.feature_importances_, clf2.feature_importances_)
    df1 = clf1.booster_.model_to_string(num_iteration=0)
    df2 = clf2.booster_.model_to_string(num_iteration=0)
    assert df1 == df2
    # Test if subsequent fits sample from random_state object and produce different models
    clf1.fit(X_train, y_train)
    y_pred1_refit = clf1.predict(X_test, raw_score=True)
    df3 = clf1.booster_.model_to_string(num_iteration=0)
    assert clf1.random_state is state1
    assert clf2.random_state is state2
    with pytest.raises(AssertionError):
        np.testing.assert_allclose(y_pred1, y_pred1_refit)
    assert df1 != df3
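
# Note: fit() draws a fresh seed from the RandomState/Generator object on every
# call, which is why two identically seeded objects give identical models above,
# while a second fit() with the same object gives a different one.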


def test_feature_importances_single_leaf():
    data = load_iris(return_X_y=False)
    clf = lgb.LGBMClassifier(n_estimators=10)
    clf.fit(data.data, data.target)
    importances = clf.feature_importances_
    assert len(importances) == 4


def test_feature_importances_type():
    data = load_iris(return_X_y=False)
    clf = lgb.LGBMClassifier(n_estimators=10)
    clf.fit(data.data, data.target)
    clf.set_params(importance_type="split")
    importances_split = clf.feature_importances_
    clf.set_params(importance_type="gain")
    importances_gain = clf.feature_importances_
    # the largest importance should differ between the two types; the smallest may coincide (e.g. both zero)
    importance_split_top1 = sorted(importances_split, reverse=True)[0]
    importance_gain_top1 = sorted(importances_gain, reverse=True)[0]
    assert importance_split_top1 != importance_gain_top1
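
# "split" counts how many times a feature is used in the trees, while "gain"
# sums the improvement its splits bring, so the raw values live on different scales.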


# Why a fixed seed? With some random permutations there is no difference in how
# columns are treated (categorical or not), and this test relies on a difference.
def test_pandas_categorical(rng_fixed_seed, tmp_path):
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame(
        {
            "A": rng_fixed_seed.permutation(["a", "b", "c", "d"] * 75),  # str
            "B": rng_fixed_seed.permutation([1, 2, 3] * 100),  # int
            "C": rng_fixed_seed.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60),  # float
            "D": rng_fixed_seed.permutation([True, False] * 150),  # bool
            "E": pd.Categorical(rng_fixed_seed.permutation(["z", "y", "x", "w", "v"] * 60), ordered=True),  # str and ordered categorical
        }
    )
    y = rng_fixed_seed.permutation([0, 1] * 150)
    X_test = pd.DataFrame(
        {
            "A": rng_fixed_seed.permutation(["a", "b", "e"] * 20),  # unseen category
            "B": rng_fixed_seed.permutation([1, 3] * 30),
            "C": rng_fixed_seed.permutation([0.1, -0.1, 0.2, 0.2] * 15),
            "D": rng_fixed_seed.permutation([True, False] * 30),
            "E": pd.Categorical(rng_fixed_seed.permutation(["z", "y"] * 30), ordered=True),
        }
    )
    cat_cols_actual = ["A", "B", "C", "D"]
    cat_cols_to_store = cat_cols_actual + ["E"]
    X[cat_cols_actual] = X[cat_cols_actual].astype("category")
    X_test[cat_cols_actual] = X_test[cat_cols_actual].astype("category")
    cat_values = [X[col].cat.categories.tolist() for col in cat_cols_to_store]
    gbm0 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y)
    pred0 = gbm0.predict(X_test, raw_score=True)
    pred_prob = gbm0.predict_proba(X_test)[:, 1]
    gbm1 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, pd.Series(y), categorical_feature=[0])
    pred1 = gbm1.predict(X_test, raw_score=True)
    gbm2 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=["A"])
    pred2 = gbm2.predict(X_test, raw_score=True)
    gbm3 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=["A", "B", "C", "D"])
    pred3 = gbm3.predict(X_test, raw_score=True)
    categorical_model_path = tmp_path / "categorical.model"
    gbm3.booster_.save_model(categorical_model_path)
    gbm4 = lgb.Booster(model_file=categorical_model_path)
    pred4 = gbm4.predict(X_test)
    gbm5 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=["A", "B", "C", "D", "E"])
    pred5 = gbm5.predict(X_test, raw_score=True)
    gbm6 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=[])
    pred6 = gbm6.predict(X_test, raw_score=True)
    with pytest.raises(AssertionError):
        np.testing.assert_allclose(pred0, pred1)
    with pytest.raises(AssertionError):
        np.testing.assert_allclose(pred0, pred2)
    np.testing.assert_allclose(pred1, pred2)
    np.testing.assert_allclose(pred0, pred3)
    np.testing.assert_allclose(pred_prob, pred4)
    with pytest.raises(AssertionError):
        np.testing.assert_allclose(pred0, pred5)  # ordered cat features aren't treated as cat features by default
    with pytest.raises(AssertionError):
        np.testing.assert_allclose(pred0, pred6)
    assert gbm0.booster_.pandas_categorical == cat_values
    assert gbm1.booster_.pandas_categorical == cat_values
    assert gbm2.booster_.pandas_categorical == cat_values
    assert gbm3.booster_.pandas_categorical == cat_values
    assert gbm4.pandas_categorical == cat_values
    assert gbm5.booster_.pandas_categorical == cat_values
    assert gbm6.booster_.pandas_categorical == cat_values


def test_pandas_sparse(rng):
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame(
        {
            "A": pd.arrays.SparseArray(rng.permutation([0, 1, 2] * 100)),
            "B": pd.arrays.SparseArray(rng.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
            "C": pd.arrays.SparseArray(rng.permutation([True, False] * 150)),
        }
    )
    y = pd.Series(pd.arrays.SparseArray(rng.permutation([0, 1] * 150)))
    X_test = pd.DataFrame(
        {
            "A": pd.arrays.SparseArray(rng.permutation([0, 2] * 30)),
            "B": pd.arrays.SparseArray(rng.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
            "C": pd.arrays.SparseArray(rng.permutation([True, False] * 30)),
        }
    )
    for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
        assert isinstance(dtype, pd.SparseDtype)
    gbm = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y)
    pred_sparse = gbm.predict(X_test, raw_score=True)
    if hasattr(X_test, "sparse"):
        pred_dense = gbm.predict(X_test.sparse.to_dense(), raw_score=True)
    else:
        pred_dense = gbm.predict(X_test.to_dense(), raw_score=True)
    np.testing.assert_allclose(pred_sparse, pred_dense)


def test_predict():
    # With default params
    iris = load_iris(return_X_y=False)
    X_train, X_test, y_train, _ = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

    gbm = lgb.train({"objective": "multiclass", "num_class": 3, "verbose": -1}, lgb.Dataset(X_train, y_train))
    clf = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train)

    # Tests same probabilities
    res_engine = gbm.predict(X_test)
    res_sklearn = clf.predict_proba(X_test)
    np.testing.assert_allclose(res_engine, res_sklearn)

    # Tests same predictions
    res_engine = np.argmax(gbm.predict(X_test), axis=1)
    res_sklearn = clf.predict(X_test)
    np.testing.assert_equal(res_engine, res_sklearn)

    # Tests same raw scores
    res_engine = gbm.predict(X_test, raw_score=True)
    res_sklearn = clf.predict(X_test, raw_score=True)
    np.testing.assert_allclose(res_engine, res_sklearn)

    # Tests same leaf indices
    res_engine = gbm.predict(X_test, pred_leaf=True)
    res_sklearn = clf.predict(X_test, pred_leaf=True)
    np.testing.assert_equal(res_engine, res_sklearn)

    # Tests same feature contributions
    res_engine = gbm.predict(X_test, pred_contrib=True)
    res_sklearn = clf.predict(X_test, pred_contrib=True)
    np.testing.assert_allclose(res_engine, res_sklearn)

    # Tests that other prediction parameters work
    res_engine = gbm.predict(X_test)
    res_sklearn_params = clf.predict_proba(X_test, pred_early_stop=True, pred_early_stop_margin=1.0)
    with pytest.raises(AssertionError):
        np.testing.assert_allclose(res_engine, res_sklearn_params)

    # Tests start_iteration
    # Tests same probabilities, starting from iteration 10
    res_engine = gbm.predict(X_test, start_iteration=10)
    res_sklearn = clf.predict_proba(X_test, start_iteration=10)
    np.testing.assert_allclose(res_engine, res_sklearn)

    # Tests same predictions, starting from iteration 10
    res_engine = np.argmax(gbm.predict(X_test, start_iteration=10), axis=1)
    res_sklearn = clf.predict(X_test, start_iteration=10)
    np.testing.assert_equal(res_engine, res_sklearn)

    # Tests same raw scores, starting from iteration 10
    res_engine = gbm.predict(X_test, raw_score=True, start_iteration=10)
    res_sklearn = clf.predict(X_test, raw_score=True, start_iteration=10)
    np.testing.assert_allclose(res_engine, res_sklearn)

    # Tests same leaf indices, starting from iteration 10
    res_engine = gbm.predict(X_test, pred_leaf=True, start_iteration=10)
    res_sklearn = clf.predict(X_test, pred_leaf=True, start_iteration=10)
    np.testing.assert_equal(res_engine, res_sklearn)

    # Tests same feature contributions, starting from iteration 10
    res_engine = gbm.predict(X_test, pred_contrib=True, start_iteration=10)
    res_sklearn = clf.predict(X_test, pred_contrib=True, start_iteration=10)
    np.testing.assert_allclose(res_engine, res_sklearn)

    # Tests that other prediction parameters work, starting from iteration 10
    res_engine = gbm.predict(X_test, start_iteration=10)
    res_sklearn_params = clf.predict_proba(X_test, pred_early_stop=True, pred_early_stop_margin=1.0, start_iteration=10)
    with pytest.raises(AssertionError):
        np.testing.assert_allclose(res_engine, res_sklearn_params)

    # Test the multiclass objective on a two-class problem
    num_samples = 100
    num_classes = 2
    X_train = np.linspace(start=0, stop=10, num=num_samples * 3).reshape(num_samples, 3)
    y_train = np.concatenate([np.zeros(int(num_samples / 2 - 10)), np.ones(int(num_samples / 2 + 10))])

    gbm = lgb.train({"objective": "multiclass", "num_class": num_classes, "verbose": -1}, lgb.Dataset(X_train, y_train))
    clf = lgb.LGBMClassifier(objective="multiclass", num_classes=num_classes).fit(X_train, y_train)

    res_engine = gbm.predict(X_train)
    res_sklearn = clf.predict_proba(X_train)

    assert res_engine.shape == (num_samples, num_classes)
    assert res_sklearn.shape == (num_samples, num_classes)
    np.testing.assert_allclose(res_engine, res_sklearn)

    res_class_sklearn = clf.predict(X_train)
    np.testing.assert_allclose(res_class_sklearn, y_train)


def test_predict_with_params_from_init():
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    predict_params = {"pred_early_stop": True, "pred_early_stop_margin": 1.0}

    y_preds_no_params = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train).predict(X_test, raw_score=True)

    y_preds_params_in_predict = (
        lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train).predict(X_test, raw_score=True, **predict_params)
    )
    with pytest.raises(AssertionError):
        np.testing.assert_allclose(y_preds_no_params, y_preds_params_in_predict)

    y_preds_params_in_set_params_before_fit = (
        lgb.LGBMClassifier(verbose=-1)
        .set_params(**predict_params)
        .fit(X_train, y_train)
        .predict(X_test, raw_score=True)
    )
    np.testing.assert_allclose(y_preds_params_in_predict, y_preds_params_in_set_params_before_fit)

    y_preds_params_in_set_params_after_fit = (
        lgb.LGBMClassifier(verbose=-1)
        .fit(X_train, y_train)
        .set_params(**predict_params)
        .predict(X_test, raw_score=True)
    )
    np.testing.assert_allclose(y_preds_params_in_predict, y_preds_params_in_set_params_after_fit)

    y_preds_params_in_init = (
        lgb.LGBMClassifier(verbose=-1, **predict_params).fit(X_train, y_train).predict(X_test, raw_score=True)
    )
    np.testing.assert_allclose(y_preds_params_in_predict, y_preds_params_in_init)

    # test that params passed in predict have higher priority
    y_preds_params_overwritten = (
        lgb.LGBMClassifier(verbose=-1, **predict_params)
        .fit(X_train, y_train)
        .predict(X_test, raw_score=True, pred_early_stop=False)
    )
    np.testing.assert_allclose(y_preds_no_params, y_preds_params_overwritten)
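
# Note: prediction-time parameters can be supplied in __init__, via set_params(),
# or directly to predict(); the test above shows all three are equivalent, with
# keyword arguments to predict() taking precedence over stored parameters.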


def test_evaluate_train_set():
    X, y = make_synthetic_regression()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    gbm = lgb.LGBMRegressor(n_estimators=10, verbose=-1)
    gbm.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])
    assert len(gbm.evals_result_) == 2
    assert "training" in gbm.evals_result_
    assert len(gbm.evals_result_["training"]) == 1
    assert "l2" in gbm.evals_result_["training"]
    assert "valid_1" in gbm.evals_result_
    assert len(gbm.evals_result_["valid_1"]) == 1
    assert "l2" in gbm.evals_result_["valid_1"]


def test_metrics():
    X, y = make_synthetic_regression()
    y = abs(y)
    params = {"n_estimators": 2, "verbose": -1}
    params_fit = {"X": X, "y": y, "eval_set": (X, y)}

    # no custom objective, no custom metric
    # default metric
    gbm = lgb.LGBMRegressor(**params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "l2" in gbm.evals_result_["training"]

    # non-default metric
    gbm = lgb.LGBMRegressor(metric="mape", **params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "mape" in gbm.evals_result_["training"]

    # no metric
    gbm = lgb.LGBMRegressor(metric="None", **params).fit(**params_fit)
    assert gbm.evals_result_ == {}

    # non-default metric in eval_metric
    gbm = lgb.LGBMRegressor(**params).fit(eval_metric="mape", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "l2" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]

    # non-default metric with non-default metric in eval_metric
    gbm = lgb.LGBMRegressor(metric="gamma", **params).fit(eval_metric="mape", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "gamma" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]

    # non-default metric with multiple metrics in eval_metric
    gbm = lgb.LGBMRegressor(metric="gamma", **params).fit(eval_metric=["l2", "mape"], **params_fit)
    assert len(gbm.evals_result_["training"]) == 3
    assert "gamma" in gbm.evals_result_["training"]
    assert "l2" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]

    # non-default metric with multiple metrics in eval_metric for LGBMClassifier
    X_classification, y_classification = load_breast_cancer(return_X_y=True)
    params_classification = {"n_estimators": 2, "verbose": -1, "objective": "binary", "metric": "binary_logloss"}
    params_fit_classification = {
        "X": X_classification,
        "y": y_classification,
        "eval_set": (X_classification, y_classification),
    }
    gbm = lgb.LGBMClassifier(**params_classification).fit(eval_metric=["fair", "error"], **params_fit_classification)
    assert len(gbm.evals_result_["training"]) == 3
    assert "fair" in gbm.evals_result_["training"]
    assert "binary_error" in gbm.evals_result_["training"]
    assert "binary_logloss" in gbm.evals_result_["training"]

    # default metric for non-default objective
    gbm = lgb.LGBMRegressor(objective="regression_l1", **params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "l1" in gbm.evals_result_["training"]

    # non-default metric for non-default objective
    gbm = lgb.LGBMRegressor(objective="regression_l1", metric="mape", **params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "mape" in gbm.evals_result_["training"]

    # no metric
    gbm = lgb.LGBMRegressor(objective="regression_l1", metric="None", **params).fit(**params_fit)
    assert gbm.evals_result_ == {}

    # non-default metric in eval_metric for non-default objective
    gbm = lgb.LGBMRegressor(objective="regression_l1", **params).fit(eval_metric="mape", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "l1" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]

    # non-default metric with non-default metric in eval_metric for non-default objective
    gbm = lgb.LGBMRegressor(objective="regression_l1", metric="gamma", **params).fit(eval_metric="mape", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "gamma" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]

    # non-default metric with multiple metrics in eval_metric for non-default objective
    gbm = lgb.LGBMRegressor(objective="regression_l1", metric="gamma", **params).fit(
        eval_metric=["l2", "mape"], **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 3
    assert "gamma" in gbm.evals_result_["training"]
    assert "l2" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]

    # custom objective, no custom metric
    # default regression metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, **params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "l2" in gbm.evals_result_["training"]

    # non-default regression metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric="mape", **params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "mape" in gbm.evals_result_["training"]

    # multiple regression metrics for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=["l1", "gamma"], **params).fit(**params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "l1" in gbm.evals_result_["training"]
    assert "gamma" in gbm.evals_result_["training"]

    # no metric
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric="None", **params).fit(**params_fit)
    assert gbm.evals_result_ == {}

    # default regression metric with non-default metric in eval_metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, **params).fit(eval_metric="mape", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "l2" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]

    # non-default regression metric with metric in eval_metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric="mape", **params).fit(eval_metric="gamma", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "mape" in gbm.evals_result_["training"]
    assert "gamma" in gbm.evals_result_["training"]

    # multiple regression metrics with metric in eval_metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=["l1", "gamma"], **params).fit(
        eval_metric="l2", **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 3
    assert "l1" in gbm.evals_result_["training"]
    assert "gamma" in gbm.evals_result_["training"]
    assert "l2" in gbm.evals_result_["training"]

    # multiple regression metrics with multiple metrics in eval_metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=["l1", "gamma"], **params).fit(
        eval_metric=["l2", "mape"], **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 4
    assert "l1" in gbm.evals_result_["training"]
    assert "gamma" in gbm.evals_result_["training"]
    assert "l2" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]

    # no custom objective, custom metric
    # default metric with custom metric
    gbm = lgb.LGBMRegressor(**params).fit(eval_metric=constant_metric, **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "l2" in gbm.evals_result_["training"]
    assert "error" in gbm.evals_result_["training"]

    # non-default metric with custom metric
    gbm = lgb.LGBMRegressor(metric="mape", **params).fit(eval_metric=constant_metric, **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "mape" in gbm.evals_result_["training"]
    assert "error" in gbm.evals_result_["training"]

    # multiple metrics with custom metric
    gbm = lgb.LGBMRegressor(metric=["l1", "gamma"], **params).fit(eval_metric=constant_metric, **params_fit)
    assert len(gbm.evals_result_["training"]) == 3
    assert "l1" in gbm.evals_result_["training"]
    assert "gamma" in gbm.evals_result_["training"]
    assert "error" in gbm.evals_result_["training"]

    # custom metric (disable default metric)
    gbm = lgb.LGBMRegressor(metric="None", **params).fit(eval_metric=constant_metric, **params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "error" in gbm.evals_result_["training"]

    # default metric for non-default objective with custom metric
    gbm = lgb.LGBMRegressor(objective="regression_l1", **params).fit(eval_metric=constant_metric, **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "l1" in gbm.evals_result_["training"]
    assert "error" in gbm.evals_result_["training"]

    # non-default metric for non-default objective with custom metric
    gbm = lgb.LGBMRegressor(objective="regression_l1", metric="mape", **params).fit(
        eval_metric=constant_metric, **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 2
    assert "mape" in gbm.evals_result_["training"]
    assert "error" in gbm.evals_result_["training"]

    # multiple metrics for non-default objective with custom metric
    gbm = lgb.LGBMRegressor(objective="regression_l1", metric=["l1", "gamma"], **params).fit(
        eval_metric=constant_metric, **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 3
    assert "l1" in gbm.evals_result_["training"]
    assert "gamma" in gbm.evals_result_["training"]
    assert "error" in gbm.evals_result_["training"]

    # custom metric (disable default metric for non-default objective)
    gbm = lgb.LGBMRegressor(objective="regression_l1", metric="None", **params).fit(
        eval_metric=constant_metric, **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 1
    assert "error" in gbm.evals_result_["training"]

    # custom objective, custom metric
    # custom metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, **params).fit(eval_metric=constant_metric, **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "error" in gbm.evals_result_["training"]

    # non-default regression metric with custom metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric="mape", **params).fit(
        eval_metric=constant_metric, **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 2
    assert "mape" in gbm.evals_result_["training"]
    assert "error" in gbm.evals_result_["training"]

    # multiple regression metrics with custom metric for custom objective
    gbm = lgb.LGBMRegressor(objective=custom_dummy_obj, metric=["l2", "mape"], **params).fit(
        eval_metric=constant_metric, **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 3
    assert "l2" in gbm.evals_result_["training"]
    assert "mape" in gbm.evals_result_["training"]
    assert "error" in gbm.evals_result_["training"]

    X, y = load_digits(n_class=3, return_X_y=True)
    params_fit = {"X": X, "y": y, "eval_set": (X, y)}

    # default metric and invalid binary metric is replaced with multiclass alternative
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric="binary_error", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "multi_logloss" in gbm.evals_result_["training"]
    assert "multi_error" in gbm.evals_result_["training"]

    # invalid binary metric is replaced with multiclass alternative
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric="binary_error", **params_fit)
    assert gbm.objective_ == "multiclass"
    assert len(gbm.evals_result_["training"]) == 2
    assert "multi_logloss" in gbm.evals_result_["training"]
    assert "multi_error" in gbm.evals_result_["training"]

    # default metric for non-default multiclass objective
    # and invalid binary metric is replaced with multiclass alternative
    gbm = lgb.LGBMClassifier(objective="ovr", **params).fit(eval_metric="binary_error", **params_fit)
    assert gbm.objective_ == "ovr"
    assert len(gbm.evals_result_["training"]) == 2
    assert "multi_logloss" in gbm.evals_result_["training"]
    assert "multi_error" in gbm.evals_result_["training"]

    X, y = load_digits(n_class=2, return_X_y=True)
    params_fit = {"X": X, "y": y, "eval_set": (X, y)}

    # default metric and invalid multiclass metric is replaced with binary alternative
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric="multi_error", **params_fit)
    assert len(gbm.evals_result_["training"]) == 2
    assert "binary_logloss" in gbm.evals_result_["training"]
    assert "binary_error" in gbm.evals_result_["training"]

    # invalid multiclass metric is replaced with binary alternative for custom objective
    gbm = lgb.LGBMClassifier(objective=custom_dummy_obj, **params).fit(eval_metric="multi_logloss", **params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "binary_logloss" in gbm.evals_result_["training"]

    # the evaluation metric changes to a multiclass metric even when the number of classes is 2 for the multiclass objective
    gbm = lgb.LGBMClassifier(objective="multiclass", num_classes=2, **params).fit(
        eval_metric="binary_logloss", **params_fit
    )
    assert len(gbm.evals_result_["training"]) == 1
    assert "multi_logloss" in gbm.evals_result_["training"]

    # the evaluation metric changes to a multiclass metric even when the number of classes is 2 for the ovr objective
    gbm = lgb.LGBMClassifier(objective="ovr", num_classes=2, **params).fit(eval_metric="binary_error", **params_fit)
    assert gbm.objective_ == "ovr"
    assert len(gbm.evals_result_["training"]) == 2
    assert "multi_logloss" in gbm.evals_result_["training"]
    assert "multi_error" in gbm.evals_result_["training"]


def test_multiple_eval_metrics():
    X, y = load_breast_cancer(return_X_y=True)

    params = {"n_estimators": 2, "verbose": -1, "objective": "binary", "metric": "binary_logloss"}
    params_fit = {"X": X, "y": y, "eval_set": (X, y)}

    # verify that fit() accepts a list of only custom (callable) metrics
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric=[constant_metric, decreasing_metric], **params_fit)
    assert len(gbm.evals_result_["training"]) == 3
    assert "error" in gbm.evals_result_["training"]
    assert "decreasing_metric" in gbm.evals_result_["training"]
    assert "binary_logloss" in gbm.evals_result_["training"]

    # verify that fit() accepts a mixed list of custom and built-in metrics
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric=[constant_metric, decreasing_metric, "fair"], **params_fit)
    assert len(gbm.evals_result_["training"]) == 4
    assert "error" in gbm.evals_result_["training"]
    assert "decreasing_metric" in gbm.evals_result_["training"]
    assert "binary_logloss" in gbm.evals_result_["training"]
    assert "fair" in gbm.evals_result_["training"]

    # verify that an empty eval_metric list works as expected
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric=[], **params_fit)
    assert len(gbm.evals_result_["training"]) == 1
    assert "binary_logloss" in gbm.evals_result_["training"]

    # verify that fit() accepts a list of only built-in metrics
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric=["fair", "error"], **params_fit)
    assert len(gbm.evals_result_["training"]) == 3
    assert "binary_logloss" in gbm.evals_result_["training"]

    # verify that eval_metric is robust to a list containing None
    gbm = lgb.LGBMClassifier(**params).fit(eval_metric=["fair", "error", None], **params_fit)
    assert len(gbm.evals_result_["training"]) == 3
    assert "binary_logloss" in gbm.evals_result_["training"]


def test_nan_handle(rng):
    nrows = 100
    ncols = 10
    X = rng.standard_normal(size=(nrows, ncols))
    y = rng.standard_normal(size=(nrows,)) + np.full(nrows, 1e30)
    weight = np.zeros(nrows)
    params = {"n_estimators": 20, "verbose": -1}
    params_fit = {"X": X, "y": y, "sample_weight": weight, "eval_set": (X, y), "callbacks": [lgb.early_stopping(5)]}
    gbm = lgb.LGBMRegressor(**params).fit(**params_fit)
    np.testing.assert_allclose(gbm.evals_result_["training"]["l2"], np.nan)

1135
1136
1137
@pytest.mark.skipif(
    getenv("TASK", "") == "cuda", reason="Skip due to differences in implementation details of CUDA version"
)
1138
1139
def test_first_metric_only():
    def fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_only):
1140
        params["first_metric_only"] = first_metric_only
1141
        gbm = lgb.LGBMRegressor(**params).fit(**params_fit)
1142
1143
1144
1145
1146
1147
1148
1149
        assert len(gbm.evals_result_) == len(eval_set_names)
        for eval_set_name in eval_set_names:
            assert eval_set_name in gbm.evals_result_
            assert len(gbm.evals_result_[eval_set_name]) == len(metric_names)
            for metric_name in metric_names:
                assert metric_name in gbm.evals_result_[eval_set_name]

                actual = len(gbm.evals_result_[eval_set_name][metric_name])
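                # when early stopping actually triggers, the recorded metric
                # history includes the extra `early_stopping_rounds` patience rounds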
                expected = assumed_iteration + (
                    params["early_stopping_rounds"]
                    if eval_set_name != "training" and assumed_iteration != gbm.n_estimators
                    else 0
                )
                assert expected == actual
                if eval_set_name != "training":
                    assert assumed_iteration == gbm.best_iteration_
                else:
                    assert gbm.n_estimators == gbm.best_iteration_

    X, y = make_synthetic_regression(n_samples=300)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, test_size=0.5, random_state=72)
    params = {
        "n_estimators": 30,
        "learning_rate": 0.8,
        "num_leaves": 15,
        "verbose": -1,
        "seed": 123,
        "early_stopping_rounds": 5,
    }  # early stop should be supported via global LightGBM parameter
    params_fit = {"X": X_train, "y": y_train}

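    # expected best iterations per (eval set, metric) pair, determined
    # empirically for this data and seed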
    iter_valid1_l1 = 4
    iter_valid1_l2 = 4
    iter_valid2_l1 = 2
    iter_valid2_l2 = 2
    assert len({iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2}) == 2
    iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1])
    iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2])
    iter_min = min([iter_min_l1, iter_min_l2])
    iter_min_valid1 = min([iter_valid1_l1, iter_valid1_l2])

    # feval
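    # the string "None" disables all built-in metrics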
    params["metric"] = "None"
    params_fit["eval_metric"] = lambda preds, train_data: [
        decreasing_metric(preds, train_data),
        constant_metric(preds, train_data),
    ]
    params_fit["eval_set"] = (X_test1, y_test1)
    fit_and_check(["valid_0"], ["decreasing_metric", "error"], 1, False)
    fit_and_check(["valid_0"], ["decreasing_metric", "error"], 30, True)
    params_fit["eval_metric"] = lambda preds, train_data: [
        constant_metric(preds, train_data),
        decreasing_metric(preds, train_data),
    ]
    fit_and_check(["valid_0"], ["decreasing_metric", "error"], 1, True)

    # single eval_set
    params.pop("metric")
    params_fit.pop("eval_metric")
    fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, False)
    fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, True)

    params_fit["eval_metric"] = "l2"
    fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, False)
    fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, True)

    params_fit["eval_metric"] = "l1"
    fit_and_check(["valid_0"], ["l1", "l2"], iter_min_valid1, False)
    fit_and_check(["valid_0"], ["l1", "l2"], iter_valid1_l1, True)

    params_fit["eval_metric"] = ["l1", "l2"]
    fit_and_check(["valid_0"], ["l1", "l2"], iter_min_valid1, False)
    fit_and_check(["valid_0"], ["l1", "l2"], iter_valid1_l1, True)

    params_fit["eval_metric"] = ["l2", "l1"]
    fit_and_check(["valid_0"], ["l1", "l2"], iter_min_valid1, False)
    fit_and_check(["valid_0"], ["l1", "l2"], iter_valid1_l2, True)

    params_fit["eval_metric"] = ["l2", "regression", "mse"]  # test aliases
    fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, False)
    fit_and_check(["valid_0"], ["l2"], iter_valid1_l2, True)

    # two eval_set
    params_fit["eval_set"] = [(X_test1, y_test1), (X_test2, y_test2)]
    params_fit["eval_metric"] = ["l1", "l2"]
    fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min_l1, True)
    params_fit["eval_metric"] = ["l2", "l1"]
    fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min_l2, True)

    params_fit["eval_set"] = [(X_test2, y_test2), (X_test1, y_test1)]
    params_fit["eval_metric"] = ["l1", "l2"]
    fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min, False)
    fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min_l1, True)
    params_fit["eval_metric"] = ["l2", "l1"]
    fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min, False)
    fit_and_check(["valid_0", "valid_1"], ["l1", "l2"], iter_min_l2, True)


def test_class_weight():
    X, y = load_digits(n_class=10, return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    y_train_str = y_train.astype("str")
    y_test_str = y_test.astype("str")
    gbm = lgb.LGBMClassifier(n_estimators=10, class_weight="balanced", verbose=-1)
    gbm.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test), (X_test, y_test), (X_test, y_test), (X_test, y_test)],
        eval_class_weight=["balanced", None, "balanced", {1: 10, 4: 20}, {5: 30, 2: 40}],
    )
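    # every eval set uses a different class weighting, so no two eval sets
    # should record identical metric values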
    for eval_set1, eval_set2 in itertools.combinations(gbm.evals_result_.keys(), 2):
        for metric in gbm.evals_result_[eval_set1]:
            np.testing.assert_raises(
                AssertionError,
                np.testing.assert_allclose,
                gbm.evals_result_[eval_set1][metric],
                gbm.evals_result_[eval_set2][metric],
            )
    gbm_str = lgb.LGBMClassifier(n_estimators=10, class_weight="balanced", verbose=-1)
    gbm_str.fit(
        X_train,
        y_train_str,
        eval_set=[
            (X_train, y_train_str),
            (X_test, y_test_str),
            (X_test, y_test_str),
            (X_test, y_test_str),
            (X_test, y_test_str),
        ],
        eval_class_weight=["balanced", None, "balanced", {"1": 10, "4": 20}, {"5": 30, "2": 40}],
    )
    for eval_set1, eval_set2 in itertools.combinations(gbm_str.evals_result_.keys(), 2):
        for metric in gbm_str.evals_result_[eval_set1]:
            np.testing.assert_raises(
                AssertionError,
                np.testing.assert_allclose,
                gbm_str.evals_result_[eval_set1][metric],
                gbm_str.evals_result_[eval_set2][metric],
            )
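    # string and integer labels with equivalent class weights should produce
    # identical metric values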
    for eval_set in gbm.evals_result_:
        for metric in gbm.evals_result_[eval_set]:
            np.testing.assert_allclose(gbm.evals_result_[eval_set][metric], gbm_str.evals_result_[eval_set][metric])


def test_continue_training_with_model():
    X, y = load_digits(n_class=3, return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    init_gbm = lgb.LGBMClassifier(n_estimators=5).fit(X_train, y_train, eval_set=(X_test, y_test))
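    # continuing from init_model records the same number of rounds but should
    # reach a lower loss than the initial model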
    gbm = lgb.LGBMClassifier(n_estimators=5).fit(X_train, y_train, eval_set=(X_test, y_test), init_model=init_gbm)
    assert len(init_gbm.evals_result_["valid_0"]["multi_logloss"]) == len(gbm.evals_result_["valid_0"]["multi_logloss"])
    assert len(init_gbm.evals_result_["valid_0"]["multi_logloss"]) == 5
    assert gbm.evals_result_["valid_0"]["multi_logloss"][-1] < init_gbm.evals_result_["valid_0"]["multi_logloss"][-1]


def test_actual_number_of_trees():
    X = [[1, 2, 3], [1, 2, 3]]
    y = [1, 1]
    n_estimators = 5
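    # y is constant, so no split can improve the loss: boosting stops after a
    # single tree and the model always predicts that constant value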
    gbm = lgb.LGBMRegressor(n_estimators=n_estimators).fit(X, y)
    assert gbm.n_estimators == n_estimators
    assert gbm.n_estimators_ == 1
    assert gbm.n_iter_ == 1
    np.testing.assert_array_equal(gbm.predict(np.array(X) * 10), y)


def test_check_is_fitted():
    X, y = load_digits(n_class=2, return_X_y=True)
    est = lgb.LGBMModel(n_estimators=5, objective="binary")
    clf = lgb.LGBMClassifier(n_estimators=5)
    reg = lgb.LGBMRegressor(n_estimators=5)
    rnk = lgb.LGBMRanker(n_estimators=5)
    models = (est, clf, reg, rnk)
    for model in models:
        with pytest.raises(lgb.compat.LGBMNotFittedError):
            check_is_fitted(model)
    est.fit(X, y)
    clf.fit(X, y)
    reg.fit(X, y)
    rnk.fit(X, y, group=np.ones(X.shape[0]))
    for model in models:
        check_is_fitted(model)


@pytest.mark.parametrize("estimator_class", estimator_classes)
@pytest.mark.parametrize("max_depth", [3, 4, 5, 8])
def test_max_depth_warning_is_never_raised(capsys, estimator_class, max_depth):
    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
    params = {"n_estimators": 1, "max_depth": max_depth, "verbose": 0}
    if estimator_class is lgb.LGBMModel:
        estimator_class(**{**params, "objective": "binary"}).fit(X, y)
    elif estimator_class is lgb.LGBMRanker:
        estimator_class(**params).fit(X, y, group=np.ones(X.shape[0]))
    else:
        estimator_class(**params).fit(X, y)
    assert "Provided parameters constrain tree depth" not in capsys.readouterr().out


def test_verbosity_is_respected_when_using_custom_objective(capsys):
    X, y = make_synthetic_regression()
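    # "nonsense" is not a real LightGBM parameter; it should trigger an
    # "Unknown parameter" warning only when verbosity allows warnings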
    params = {
        "objective": objective_ls,
        "nonsense": 123,
        "num_leaves": 3,
    }
    lgb.LGBMRegressor(**params, verbosity=-1, n_estimators=1).fit(X, y)
    assert capsys.readouterr().out == ""
    lgb.LGBMRegressor(**params, verbosity=0, n_estimators=1).fit(X, y)
    assert "[LightGBM] [Warning] Unknown parameter: nonsense" in capsys.readouterr().out


def test_fit_only_raises_num_rounds_warning_when_expected(capsys):
    X, y = make_synthetic_regression()
    base_kwargs = {
        "num_leaves": 5,
        "verbosity": -1,
    }

    # no warning: no aliases, all defaults
    reg = lgb.LGBMRegressor(**base_kwargs).fit(X, y)
    assert reg.n_estimators_ == 100
    assert_silent(capsys)

    # no warning: no aliases, just n_estimators
    reg = lgb.LGBMRegressor(**base_kwargs, n_estimators=2).fit(X, y)
    assert reg.n_estimators_ == 2
    assert_silent(capsys)

    # no warning: 1 alias + n_estimators (both same value)
    reg = lgb.LGBMRegressor(**base_kwargs, n_estimators=3, n_iter=3).fit(X, y)
    assert reg.n_estimators_ == 3
    assert_silent(capsys)

    # no warning: 1 alias + n_estimators (different values... value from params should win)
    reg = lgb.LGBMRegressor(**base_kwargs, n_estimators=3, n_iter=4).fit(X, y)
    assert reg.n_estimators_ == 4
    assert_silent(capsys)

    # no warning: 2 aliases (both same value)
    reg = lgb.LGBMRegressor(**base_kwargs, n_iter=3, num_iterations=3).fit(X, y)
    assert reg.n_estimators_ == 3
    assert_silent(capsys)

    # no warning: 4 aliases (all same value)
    reg = lgb.LGBMRegressor(**base_kwargs, n_iter=3, num_trees=3, nrounds=3, max_iter=3).fit(X, y)
    assert reg.n_estimators_ == 3
    assert_silent(capsys)

    # warning: 2 aliases (different values... "num_iterations" wins because it's the main param name)
    with pytest.warns(UserWarning, match="LightGBM will perform up to 5 boosting rounds"):
        reg = lgb.LGBMRegressor(**base_kwargs, num_iterations=5, n_iter=6).fit(X, y)
    assert reg.n_estimators_ == 5
    # should not be any other logs (except the warning, intercepted by pytest)
    assert_silent(capsys)

    # warning: 2 aliases (different values... first one in the order from Config::parameter2aliases() wins)
    with pytest.warns(UserWarning, match="LightGBM will perform up to 4 boosting rounds"):
        reg = lgb.LGBMRegressor(**base_kwargs, n_iter=4, max_iter=5).fit(X, y)
    assert reg.n_estimators_ == 4
    # should not be any other logs (except the warning, intercepted by pytest)
    assert_silent(capsys)


@pytest.mark.parametrize("estimator_class", estimator_classes)
def test_getting_feature_names_in_np_input(estimator_class):
    # input is a numpy array, which doesn't have feature names. LightGBM adds
    # feature names to the fitted model, which is inconsistent with sklearn's behavior
    X, y = load_digits(n_class=2, return_X_y=True)
    params = {"n_estimators": 2, "num_leaves": 7}
    if estimator_class is lgb.LGBMModel:
        model = estimator_class(**{**params, "objective": "binary"})
    else:
        model = estimator_class(**params)
    with pytest.raises(lgb.compat.LGBMNotFittedError):
        check_is_fitted(model)
    if isinstance(model, lgb.LGBMRanker):
        model.fit(X, y, group=[X.shape[0]])
    else:
        model.fit(X, y)
    np.testing.assert_array_equal(model.feature_names_in_, np.array([f"Column_{i}" for i in range(X.shape[1])]))


@pytest.mark.parametrize("estimator_class", estimator_classes)
def test_getting_feature_names_in_pd_input(estimator_class):
    X, y = load_digits(n_class=2, return_X_y=True, as_frame=True)
    col_names = X.columns.to_list()
    assert isinstance(col_names, list) and all(isinstance(c, str) for c in col_names), (
        "input data must have feature names for this test to cover the expected functionality"
    )
    params = {"n_estimators": 2, "num_leaves": 7}
    if estimator_class is lgb.LGBMModel:
        model = estimator_class(**{**params, "objective": "binary"})
    else:
        model = estimator_class(**params)
    with pytest.raises(lgb.compat.LGBMNotFittedError):
        check_is_fitted(model)
    if isinstance(model, lgb.LGBMRanker):
        model.fit(X, y, group=[X.shape[0]])
    else:
        model.fit(X, y)
    np.testing.assert_array_equal(model.feature_names_in_, X.columns)


# Starting with scikit-learn 1.6 (https://github.com/scikit-learn/scikit-learn/pull/30149),
# the only API for marking estimator tests as expected to fail is to pass a keyword argument
# to parametrize_with_checks(). That function didn't accept additional arguments in earlier
# versions.
#
# This block defines a patched version of parametrize_with_checks() so lightgbm's tests
# can be compatible with scikit-learn <1.6 and >=1.6.
#
# This should be removed once minimum supported scikit-learn version is at least 1.6.
if SKLEARN_VERSION_GTE_1_6:
    parametrize_with_checks = sklearn_parametrize_with_checks
else:

    def parametrize_with_checks(estimator, *args, **kwargs):
        return sklearn_parametrize_with_checks(estimator)


def _get_expected_failed_tests(estimator):
    return estimator._more_tags()["_xfail_checks"]


@parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()], expected_failed_checks=_get_expected_failed_tests)
def test_sklearn_integration(estimator, check):
    estimator.set_params(min_child_samples=1, min_data_in_bin=1)
    check(estimator)


@pytest.mark.parametrize("estimator_class", estimator_classes)
def test_sklearn_tags_should_correctly_reflect_lightgbm_specific_values(estimator_class):
    est = estimator_class()
    more_tags = est._more_tags()
    err_msg = "List of supported X_types has changed. Update LGBMModel.__sklearn_tags__() to match."
    assert more_tags["X_types"] == ["2darray", "sparse", "1dlabels"], err_msg
    # the try-except part of this should be removed once lightgbm's
    # minimum supported scikit-learn version is at least 1.6
    try:
        sklearn_tags = est.__sklearn_tags__()
    except AttributeError as err:
        # only the exact error we expected to be raised should be raised
        assert bool(re.search(r"__sklearn_tags__.* should not be called", str(err)))
    else:
        # if no AttributeError was thrown, we must be using scikit-learn>=1.6,
        # and so the actual effects of __sklearn_tags__() should be tested
        assert sklearn_tags.input_tags.allow_nan is True
        assert sklearn_tags.input_tags.sparse is True
        assert sklearn_tags.target_tags.one_d_labels is True
        if estimator_class is lgb.LGBMClassifier:
            assert sklearn_tags.estimator_type == "classifier"
            assert sklearn_tags.classifier_tags.multi_class is True
            assert sklearn_tags.classifier_tags.multi_label is False
        elif estimator_class is lgb.LGBMRegressor:
            assert sklearn_tags.estimator_type == "regressor"


@pytest.mark.parametrize("task", all_tasks)
def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task):
    pd = pytest.importorskip("pandas")
    X, y, g = _create_data(task)
    X = pd.DataFrame(X)
    y_col_array = y.reshape(-1, 1)
    params = {"n_estimators": 1, "num_leaves": 3, "random_state": 0}
    model_factory = task_to_model_factory[task]
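    # scikit-learn warns about column-vector labels; fitting should still
    # succeed and match the predictions of the 1d-label model exactly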
    with pytest.warns(UserWarning, match="column-vector"):
        if task == "ranking":
            model_1d = model_factory(**params).fit(X, y, group=g)
            model_2d = model_factory(**params).fit(X, y_col_array, group=g)
        else:
            model_1d = model_factory(**params).fit(X, y)
            model_2d = model_factory(**params).fit(X, y_col_array)

    preds_1d = model_1d.predict(X)
    preds_2d = model_2d.predict(X)
    np.testing.assert_array_equal(preds_1d, preds_2d)


@pytest.mark.parametrize("use_weight", [True, False])
def test_multiclass_custom_objective(use_weight):
    centers = [[-4, -4], [4, 4], [-4, 4]]
    X, y = make_blobs(n_samples=1_000, centers=centers, random_state=42)
    weight = np.full_like(y, 2) if use_weight else None
    params = {"n_estimators": 10, "num_leaves": 7}
    builtin_obj_model = lgb.LGBMClassifier(**params)
    builtin_obj_model.fit(X, y, sample_weight=weight)
    builtin_obj_preds = builtin_obj_model.predict_proba(X)

    custom_obj_model = lgb.LGBMClassifier(objective=sklearn_multiclass_custom_objective, **params)
    custom_obj_model.fit(X, y, sample_weight=weight)
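    # a custom objective produces raw scores, so apply softmax to make them
    # comparable with the built-in objective's probabilities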
    custom_obj_preds = softmax(custom_obj_model.predict(X, raw_score=True))

    np.testing.assert_allclose(builtin_obj_preds, custom_obj_preds, rtol=0.01)
    assert not callable(builtin_obj_model.objective_)
    assert callable(custom_obj_model.objective_)


@pytest.mark.parametrize("use_weight", [True, False])
def test_multiclass_custom_eval(use_weight):
    def custom_eval(y_true, y_pred, weight):
        loss = log_loss(y_true, y_pred, sample_weight=weight)
        return "custom_logloss", loss, False

    centers = [[-4, -4], [4, 4], [-4, 4]]
    X, y = make_blobs(n_samples=1_000, centers=centers, random_state=42)
    train_test_split_func = partial(train_test_split, test_size=0.2, random_state=0)
    X_train, X_valid, y_train, y_valid = train_test_split_func(X, y)
    if use_weight:
        weight = np.full_like(y, 2)
        weight_train, weight_valid = train_test_split_func(weight)
    else:
        weight_train = None
        weight_valid = None
    params = {"objective": "multiclass", "num_class": 3, "num_leaves": 7}
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        sample_weight=weight_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_names=["train", "valid"],
        eval_sample_weight=[weight_train, weight_valid],
        eval_metric=custom_eval,
    )
    eval_result = model.evals_result_
    train_ds = (X_train, y_train, weight_train)
    valid_ds = (X_valid, y_valid, weight_valid)
    for key, (X, y_true, weight) in zip(["train", "valid"], [train_ds, valid_ds]):
        np.testing.assert_allclose(eval_result[key]["multi_logloss"], eval_result[key]["custom_logloss"])
        y_pred = model.predict_proba(X)
        _, metric_value, _ = custom_eval(y_true, y_pred, weight)
        np.testing.assert_allclose(metric_value, eval_result[key]["custom_logloss"][-1])


def test_negative_n_jobs(tmp_path):
    n_threads = joblib.cpu_count()
    if n_threads <= 1:
        pytest.skip("requires more than one available thread to test negative n_jobs")
    # 'val_minus_two' here is the expected number of threads for n_jobs=-2
    val_minus_two = n_threads - 1
    X, y = load_breast_cancer(return_X_y=True)
    # Note: according to joblib's formula, a value of n_jobs=-2 means
    # "use all but one thread" (formula: n_cpus + 1 + n_jobs)
    gbm = lgb.LGBMClassifier(n_estimators=2, verbose=-1, n_jobs=-2).fit(X, y)
    gbm.booster_.save_model(tmp_path / "model.txt")
    with open(tmp_path / "model.txt", "r") as f:
        model_txt = f.read()
    assert bool(re.search(rf"\[num_threads: {val_minus_two}\]", model_txt))


def test_default_n_jobs(tmp_path):
    n_cores = joblib.cpu_count(only_physical_cores=True)
    X, y = load_breast_cancer(return_X_y=True)
    gbm = lgb.LGBMClassifier(n_estimators=2, verbose=-1, n_jobs=None).fit(X, y)
    gbm.booster_.save_model(tmp_path / "model.txt")
    with open(tmp_path / "model.txt", "r") as f:
        model_txt = f.read()
    assert bool(re.search(rf"\[num_threads: {n_cores}\]", model_txt))


@pytest.mark.skipif(not PANDAS_INSTALLED, reason="pandas is not installed")
@pytest.mark.parametrize("task", all_tasks)
def test_validate_features(task):
    X, y, g = _create_data(task, n_features=4)
    features = ["x1", "x2", "x3", "x4"]
    df = pd_DataFrame(X, columns=features)
    model = task_to_model_factory[task](n_estimators=10, num_leaves=15, verbose=-1)
    if task == "ranking":
        model.fit(df, y, group=g)
    else:
        model.fit(df, y)
    assert model.feature_name_ == features

    # try to predict with a different feature name
    df2 = df.rename(columns={"x2": "z"})
    with pytest.raises(lgb.basic.LightGBMError, match="Expected 'x2' at position 1 but found 'z'"):
        model.predict(df2, validate_features=True)

    # check that disabling the check doesn't raise the error
    model.predict(df2, validate_features=False)


# LightGBM's 'predict_disable_shape_check' mechanism is intentionally not respected by
# its scikit-learn estimators, for consistency with scikit-learn's own behavior.
@pytest.mark.parametrize("task", all_tasks)
@pytest.mark.parametrize("predict_disable_shape_check", [True, False])
def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disable_shape_check, task):
    X, y, g = _create_data(task, n_features=4)
    model_factory = task_to_model_factory[task]
    fit_kwargs = {"X": X[:, :-1], "y": y}
    if task == "ranking":
        estimator_name = "LGBMRanker"
        fit_kwargs.update({"group": g})
    elif task == "regression":
        estimator_name = "LGBMRegressor"
    else:
        estimator_name = "LGBMClassifier"

    # train on the first 3 features
    model = model_factory(n_estimators=5, num_leaves=7, verbose=-1).fit(**fit_kwargs)

    # more cols in X than features: error
    err_msg = f"X has 4 features, but {estimator_name} is expecting 3 features as input"
    with pytest.raises(ValueError, match=err_msg):
        model.predict(X, predict_disable_shape_check=predict_disable_shape_check)

    if estimator_name == "LGBMClassifier":
        with pytest.raises(ValueError, match=err_msg):
            model.predict_proba(X, predict_disable_shape_check=predict_disable_shape_check)

    # fewer cols in X than features: error
    err_msg = f"X has 2 features, but {estimator_name} is expecting 3 features as input"
    with pytest.raises(ValueError, match=err_msg):
        model.predict(X[:, :-2], predict_disable_shape_check=predict_disable_shape_check)

    if estimator_name == "LGBMClassifier":
        with pytest.raises(ValueError, match=err_msg):
            model.predict_proba(X[:, :-2], predict_disable_shape_check=predict_disable_shape_check)

    # same number of columns in both: no error
    preds = model.predict(X[:, :-1], predict_disable_shape_check=predict_disable_shape_check)
    assert preds.shape == y.shape

    if estimator_name == "LGBMClassifier":
        preds = model.predict_proba(X[:, :-1], predict_disable_shape_check=predict_disable_shape_check)
        assert preds.shape[0] == y.shape[0]


@pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
@pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_Series", "pd_DataFrame"])
@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "regression"])
def test_classification_and_regression_minimally_work_with_all_accepted_data_types(X_type, y_type, task, rng):
    if any(t.startswith("pd_") for t in [X_type, y_type]) and not PANDAS_INSTALLED:
        pytest.skip("pandas is not installed")
    if any(t.startswith("dt_") for t in [X_type, y_type]) and not DATATABLE_INSTALLED:
        pytest.skip("datatable is not installed")
    X, y, g = _create_data(task, n_samples=2_000)
    weights = np.abs(rng.standard_normal(size=(y.shape[0],)))

    if task == "binary-classification" or task == "regression":
        init_score = np.full_like(y, np.mean(y))
    elif task == "multiclass-classification":
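        # multiclass init_score needs one column per class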
        init_score = np.outer(y, np.array([0.1, 0.2, 0.7]))
    else:
        raise ValueError(f"Unrecognized task '{task}'")

    X_valid = X * 2
    if X_type == "dt_DataTable":
        X = dt_DataTable(X)
    elif X_type == "list2d":
        X = X.tolist()
    elif X_type == "scipy_csc":
        X = scipy.sparse.csc_matrix(X)
    elif X_type == "scipy_csr":
        X = scipy.sparse.csr_matrix(X)
    elif X_type == "pd_DataFrame":
        X = pd_DataFrame(X)
    elif X_type != "numpy":
        raise ValueError(f"Unrecognized X_type: '{X_type}'")

    # make weights and init_score the same type as y, just to avoid an
    # explosion in the number of combinations and therefore test cases
    if y_type == "list1d":
        y = y.tolist()
        weights = weights.tolist()
        init_score = init_score.tolist()
    elif y_type == "pd_DataFrame":
        y = pd_DataFrame(y)
        weights = pd_Series(weights)
        if task == "multiclass-classification":
            init_score = pd_DataFrame(init_score)
        else:
            init_score = pd_Series(init_score)
    elif y_type == "pd_Series":
        y = pd_Series(y)
        weights = pd_Series(weights)
        if task == "multiclass-classification":
            init_score = pd_DataFrame(init_score)
        else:
            init_score = pd_Series(init_score)
    elif y_type != "numpy":
        raise ValueError(f"Unrecognized y_type: '{y_type}'")

    model = task_to_model_factory[task](n_estimators=10, verbose=-1)
    model.fit(
        X=X,
        y=y,
        sample_weight=weights,
        init_score=init_score,
        eval_set=[(X_valid, y)],
        eval_sample_weight=[weights],
        eval_init_score=[init_score],
    )

    preds = model.predict(X)
    if task == "binary-classification":
        assert accuracy_score(y, preds) >= 0.99
    elif task == "multiclass-classification":
        assert accuracy_score(y, preds) >= 0.99
    elif task == "regression":
        assert r2_score(y, preds) > 0.86
    else:
        raise ValueError(f"Unrecognized task: '{task}'")


@pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
@pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_DataFrame", "pd_Series"])
@pytest.mark.parametrize("g_type", ["list1d_float", "list1d_int", "numpy", "pd_Series"])
def test_ranking_minimally_works_with_all_accepted_data_types(X_type, y_type, g_type, rng):
    if any(t.startswith("pd_") for t in [X_type, y_type, g_type]) and not PANDAS_INSTALLED:
        pytest.skip("pandas is not installed")
    if any(t.startswith("dt_") for t in [X_type, y_type, g_type]) and not DATATABLE_INSTALLED:
        pytest.skip("datatable is not installed")
    X, y, g = _create_data(task="ranking", n_samples=1_000)
    weights = np.abs(rng.standard_normal(size=(y.shape[0],)))
    init_score = np.full_like(y, np.mean(y))
    X_valid = X * 2

    if X_type == "dt_DataTable":
        X = dt_DataTable(X)
    elif X_type == "list2d":
        X = X.tolist()
    elif X_type == "scipy_csc":
        X = scipy.sparse.csc_matrix(X)
    elif X_type == "scipy_csr":
        X = scipy.sparse.csr_matrix(X)
    elif X_type == "pd_DataFrame":
        X = pd_DataFrame(X)
    elif X_type != "numpy":
        raise ValueError(f"Unrecognized X_type: '{X_type}'")

    # make weights and init_score the same type as y, just to avoid an
    # explosion in the number of combinations and therefore test cases
    if y_type == "list1d":
        y = y.tolist()
        weights = weights.tolist()
        init_score = init_score.tolist()
    elif y_type == "pd_DataFrame":
        y = pd_DataFrame(y)
        weights = pd_Series(weights)
        init_score = pd_Series(init_score)
    elif y_type == "pd_Series":
        y = pd_Series(y)
        weights = pd_Series(weights)
        init_score = pd_Series(init_score)
    elif y_type != "numpy":
        raise ValueError(f"Unrecognized y_type: '{y_type}'")

    if g_type == "list1d_float":
        g = g.astype("float").tolist()
    elif g_type == "list1d_int":
        g = g.astype("int").tolist()
    elif g_type == "pd_Series":
        g = pd_Series(g)
    elif g_type != "numpy":
        raise ValueError(f"Unrecognized g_type: '{g_type}'")

    model = task_to_model_factory["ranking"](n_estimators=10, verbose=-1)
    model.fit(
        X=X,
        y=y,
        sample_weight=weights,
        init_score=init_score,
        group=g,
        eval_set=[(X_valid, y)],
        eval_sample_weight=[weights],
        eval_init_score=[init_score],
        eval_group=[g],
    )

    preds = model.predict(X)
    assert spearmanr(preds, y).correlation >= 0.99


def test_classifier_fit_detects_classes_every_time():
    rng = np.random.default_rng(seed=123)
    nrows = 1000
    ncols = 20

    X = rng.standard_normal(size=(nrows, ncols))
    y_bin = (rng.random(size=nrows) <= 0.3).astype(np.float64)
    y_multi = rng.integers(4, size=nrows)

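    # each call to fit() must re-detect the classes and objective from y,
    # not reuse state from a previous fit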
    model = lgb.LGBMClassifier(verbose=-1)
    for _ in range(2):
        model.fit(X, y_multi)
        assert model.objective_ == "multiclass"
        model.fit(X, y_bin)
        assert model.objective_ == "binary"