"git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "3d9ada76574e3e246155f4410f285c334f148dec"
# coding: utf-8
import filecmp
import numbers
import re
from copy import deepcopy
from os import getenv
from pathlib import Path

import numpy as np
import pytest
from scipy import sparse
from sklearn.datasets import dump_svmlight_file, load_svmlight_file, make_blobs
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame, pd_Series

from .utils import dummy_obj, load_breast_cancer, mse_obj, np_assert_array_equal


def test_basic(tmp_path):
    X_train, X_test, y_train, y_test = train_test_split(
        *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
    )
    feature_names = [f"Column_{i}" for i in range(X_train.shape[1])]
    feature_names[1] = "a" * 1000  # set one name to a value longer than default buffer size
    train_data = lgb.Dataset(X_train, label=y_train, feature_name=feature_names)
    valid_data = train_data.create_valid(X_test, label=y_test)

    params = {
        "objective": "binary",
        "metric": "auc",
        "min_data": 10,
        "num_leaves": 15,
        "verbose": -1,
        "num_threads": 1,
        "max_bin": 255,
        "gpu_use_dp": True,
    }
    bst = lgb.Booster(params, train_data)
    bst.add_valid(valid_data, "valid_1")

    for i in range(20):
        bst.update()
        if i % 10 == 0:
            print(bst.eval_train(), bst.eval_valid())

    assert train_data.get_feature_name() == feature_names

    assert bst.current_iteration() == 20
    assert bst.num_trees() == 20
    assert bst.num_model_per_iteration() == 1
    if getenv("TASK", "") != "cuda":
        assert bst.lower_bound() == pytest.approx(-2.9040190126976606)
        assert bst.upper_bound() == pytest.approx(3.3182142872462883)

    tname = tmp_path / "svm_light.dat"
    model_file = tmp_path / "model.txt"

    bst.save_model(model_file)
    pred_from_matr = bst.predict(X_test)
    with open(tname, "w+b") as f:
        dump_svmlight_file(X_test, y_test, f)
    pred_from_file = bst.predict(tname)
    np.testing.assert_allclose(pred_from_matr, pred_from_file)

    # check saved model persistence
    bst = lgb.Booster(params, model_file=model_file)
    assert bst.feature_name() == feature_names
    pred_from_model_file = bst.predict(X_test)
    # we need to check the consistency of model file here, so test for exact equal
    np.testing.assert_array_equal(pred_from_matr, pred_from_model_file)

    # check early stopping is working. Make it stop very early, so the scores should be very close to zero
    pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5}
    pred_early_stopping = bst.predict(X_test, **pred_parameter)
    # scores likely to be different, but prediction should still be the same
    np.testing.assert_array_equal(np.sign(pred_from_matr), np.sign(pred_early_stopping))

    # test that shape is checked during prediction
    bad_X_test = X_test[:, 1:]
    bad_shape_error_msg = "The number of features in data*"
    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, bad_X_test)
    np.testing.assert_raises_regex(
        lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, sparse.csr_matrix(bad_X_test)
    )
    np.testing.assert_raises_regex(
        lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, sparse.csc_matrix(bad_X_test)
    )
    with open(tname, "w+b") as f:
        dump_svmlight_file(bad_X_test, y_test, f)
    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, tname)
    with open(tname, "w+b") as f:
        dump_svmlight_file(X_test, y_test, f, zero_based=False)
    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, tname)


class NumpySequence(lgb.Sequence):
    def __init__(self, ndarray, batch_size):
        self.ndarray = ndarray
        self.batch_size = batch_size

    def __getitem__(self, idx):
        # The simple implementation is just a single "return self.ndarray[idx]"
        # The following is for demo and testing purposes.
        if isinstance(idx, numbers.Integral):
            return self.ndarray[idx]
        elif isinstance(idx, slice):
            if not (idx.step is None or idx.step == 1):
                raise NotImplementedError("No need to implement, caller will not set step for now")
            return self.ndarray[idx.start : idx.stop]
        elif isinstance(idx, list):
            return self.ndarray[idx]
        else:
            raise TypeError(f"Sequence Index must be an integer/list/slice, got {type(idx).__name__}")

    def __len__(self):
        return len(self.ndarray)


def _create_sequence_from_ndarray(data, num_seq, batch_size):
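    # Wrap the array in a single NumpySequence, or split its rows into num_seq roughly equal sequences.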
    if num_seq == 1:
        return NumpySequence(data, batch_size)

    nrow = data.shape[0]
    seqs = []
    seq_size = nrow // num_seq
    for start in range(0, nrow, seq_size):
        end = min(start + seq_size, nrow)
        seq = NumpySequence(data[start:end], batch_size)
        seqs.append(seq)
    return seqs


@pytest.mark.parametrize("sample_count", [11, 100, None])
@pytest.mark.parametrize("batch_size", [3, None])
@pytest.mark.parametrize("include_0_and_nan", [False, True])
@pytest.mark.parametrize("num_seq", [1, 3])
def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq, rng):
    params = {"bin_construct_sample_cnt": sample_count}

    nrow = 50
    half_nrow = nrow // 2
    ncol = 11
    data = np.arange(nrow * ncol, dtype=np.float64).reshape((nrow, ncol))

    if include_0_and_nan:
        # whole col
        data[:, 0] = 0
        data[:, 1] = np.nan

        # half col
        data[:half_nrow, 3] = 0
        data[:half_nrow, 2] = np.nan

        data[half_nrow:-2, 4] = 0
        data[:half_nrow, 4] = np.nan

    X = data[:, :-1]
    Y = data[:, -1]

    npy_bin_fname = tmpdir / "data_from_npy.bin"
    seq_bin_fname = tmpdir / "data_from_seq.bin"

    # Create dataset from numpy array directly.
    ds = lgb.Dataset(X, label=Y, params=params)
    ds.save_binary(npy_bin_fname)

    # Create dataset using Sequence.
    seqs = _create_sequence_from_ndarray(X, num_seq, batch_size)
    seq_ds = lgb.Dataset(seqs, label=Y, params=params)
    seq_ds.save_binary(seq_bin_fname)

    assert filecmp.cmp(npy_bin_fname, seq_bin_fname)

    # Test for validation set.
    # Select some random rows as valid data.
    valid_idx = (rng.random(10) * nrow).astype(np.int32)
    valid_data = data[valid_idx, :]
    valid_X = valid_data[:, :-1]
    valid_Y = valid_data[:, -1]

    valid_npy_bin_fname = tmpdir / "valid_data_from_npy.bin"
    valid_seq_bin_fname = tmpdir / "valid_data_from_seq.bin"
    valid_seq2_bin_fname = tmpdir / "valid_data_from_seq2.bin"

    valid_ds = lgb.Dataset(valid_X, label=valid_Y, params=params, reference=ds)
    valid_ds.save_binary(valid_npy_bin_fname)

    # From Dataset constructor, with dataset from numpy array.
    valid_seqs = _create_sequence_from_ndarray(valid_X, num_seq, batch_size)
    valid_seq_ds = lgb.Dataset(valid_seqs, label=valid_Y, params=params, reference=ds)
    valid_seq_ds.save_binary(valid_seq_bin_fname)
    assert filecmp.cmp(valid_npy_bin_fname, valid_seq_bin_fname)

    # From Dataset.create_valid, with dataset from sequence.
    valid_seq_ds2 = seq_ds.create_valid(valid_seqs, label=valid_Y, params=params)
    valid_seq_ds2.save_binary(valid_seq2_bin_fname)
    assert filecmp.cmp(valid_npy_bin_fname, valid_seq2_bin_fname)


@pytest.mark.parametrize("num_seq", [1, 2])
def test_sequence_get_data(num_seq, rng):
    nrow = 20
    ncol = 11
    data = np.arange(nrow * ncol, dtype=np.float64).reshape((nrow, ncol))
    X = data[:, :-1]
    Y = data[:, -1]

    seqs = _create_sequence_from_ndarray(data=X, num_seq=num_seq, batch_size=6)
    seq_ds = lgb.Dataset(seqs, label=Y, params=None, free_raw_data=False).construct()
    assert seq_ds.get_data() == seqs

    used_indices = rng.choice(a=np.arange(nrow), size=nrow // 3, replace=False)
    subset_data = seq_ds.subset(used_indices).construct()
    np.testing.assert_array_equal(subset_data.get_data(), X[sorted(used_indices)])


def test_chunked_dataset():
    X_train, X_test, y_train, y_test = train_test_split(
        *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
    )

    chunk_size = X_train.shape[0] // 10 + 1
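    # Split the train and test matrices into lists of row-wise chunks.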
    X_train = [X_train[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
    X_test = [X_test[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]

    train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100})
    valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100})
    train_data.construct()
    valid_data.construct()


def test_chunked_dataset_linear():
    X_train, X_test, y_train, y_test = train_test_split(
        *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
    )
    chunk_size = X_train.shape[0] // 10 + 1
    X_train = [X_train[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
    X_test = [X_test[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
    params = {"bin_construct_sample_cnt": 100, "linear_tree": True}
    train_data = lgb.Dataset(X_train, label=y_train, params=params)
    valid_data = train_data.create_valid(X_test, label=y_test, params=params)
    train_data.construct()
    valid_data.construct()


def test_save_dataset_subset_and_load_from_file(tmp_path, rng):
    data = rng.standard_normal(size=(100, 2))
    params = {"max_bin": 50, "min_data_in_bin": 10}
    ds = lgb.Dataset(data, params=params)
    ds.subset([1, 2, 3, 5, 8]).save_binary(tmp_path / "subset.bin")
    lgb.Dataset(tmp_path / "subset.bin", params=params).construct()


def test_subset_group():
    rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank"
    X_train, y_train = load_svmlight_file(str(rank_example_dir / "rank.train"))
    q_train = np.loadtxt(str(rank_example_dir / "rank.train.query"))
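    # Each entry of q_train is the number of rows belonging to one query group.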
    lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
    assert len(lgb_train.get_group()) == 201
    subset = lgb_train.subset(list(range(10))).construct()
    subset_group = subset.get_group()
    assert len(subset_group) == 2
    assert subset_group[0] == 1
    assert subset_group[1] == 9


def test_add_features_throws_if_num_data_unequal(rng):
    X1 = rng.uniform(size=(100, 1))
    X2 = rng.uniform(size=(10, 1))
    d1 = lgb.Dataset(X1).construct()
    d2 = lgb.Dataset(X2).construct()
    with pytest.raises(
        lgb.basic.LightGBMError, match="Cannot add features from other Dataset with a different number of rows"
    ):
        d1.add_features_from(d2)


def test_add_features_throws_if_datasets_unconstructed(rng):
    X1 = rng.uniform(size=(100, 1))
    X2 = rng.uniform(size=(100, 1))
    err_msg = "Both source and target Datasets must be constructed before adding features"
    d1 = lgb.Dataset(X1)
    d2 = lgb.Dataset(X2)
    with pytest.raises(ValueError, match=err_msg):
        d1.add_features_from(d2)
    d1 = lgb.Dataset(X1).construct()
    d2 = lgb.Dataset(X2)
    with pytest.raises(ValueError, match=err_msg):
        d1.add_features_from(d2)
    d1 = lgb.Dataset(X1)
    d2 = lgb.Dataset(X2).construct()
    with pytest.raises(ValueError, match=err_msg):
        d1.add_features_from(d2)


def test_add_features_equal_data_on_alternating_used_unused(tmp_path, rng):
    X = rng.uniform(size=(100, 5))
    X[:, [1, 3]] = 0
    names = [f"col_{i}" for i in range(5)]
    for j in range(1, 5):
        d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
        d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
        d1.add_features_from(d2)
        d1name = tmp_path / "d1.txt"
        d1._dump_text(d1name)
        d = lgb.Dataset(X, feature_name=names).construct()
        dname = tmp_path / "d.txt"
        d._dump_text(dname)
        with open(d1name, "rt") as d1f:
            d1txt = d1f.read()
        with open(dname, "rt") as df:
            dtxt = df.read()
        assert dtxt == d1txt


def test_add_features_same_booster_behaviour(tmp_path, rng):
    X = rng.uniform(size=(100, 5))
    X[:, [1, 3]] = 0
    names = [f"col_{i}" for i in range(5)]
    for j in range(1, 5):
        d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
        d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
        d1.add_features_from(d2)
        d = lgb.Dataset(X, feature_name=names).construct()
        y = rng.uniform(size=(100,))
        d1.set_label(y)
        d.set_label(y)
        b1 = lgb.Booster(train_set=d1)
        b = lgb.Booster(train_set=d)
        for _ in range(10):
            b.update()
            b1.update()
        dname = tmp_path / "d.txt"
        d1name = tmp_path / "d1.txt"
        b1.save_model(d1name)
        b.save_model(dname)
        with open(dname, "rt") as df:
            dtxt = df.read()
        with open(d1name, "rt") as d1f:
            d1txt = d1f.read()
        assert dtxt == d1txt


def test_add_features_from_different_sources(rng):
    pd = pytest.importorskip("pandas")
    n_row = 100
    n_col = 5
    X = rng.uniform(size=(n_row, n_col))
    xxs = [X, sparse.csr_matrix(X), pd.DataFrame(X)]
    names = [f"col_{i}" for i in range(n_col)]
    seq = _create_sequence_from_ndarray(X, 1, 30)
    seq_ds = lgb.Dataset(seq, feature_name=names, free_raw_data=False).construct()
    npy_list_ds = lgb.Dataset(
        [X[: n_row // 2, :], X[n_row // 2 :, :]], feature_name=names, free_raw_data=False
    ).construct()
    immergeable_dds = [seq_ds, npy_list_ds]
    for x_1 in xxs:
        # test that method works even with free_raw_data=True
        d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=True).construct()
        d2 = lgb.Dataset(x_1, feature_name=names, free_raw_data=True).construct()
        d1.add_features_from(d2)
        assert d1.data is None

        # test that method works but sets raw data to None in case of immergeable data types
        d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=False).construct()
        for d2 in immergeable_dds:
            d1.add_features_from(d2)
            assert d1.data is None

        # test that method works for different data types
        d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=False).construct()
        res_feature_names = deepcopy(names)
        for idx, x_2 in enumerate(xxs, 2):
            original_type = type(d1.get_data())
            d2 = lgb.Dataset(x_2, feature_name=names, free_raw_data=False).construct()
            d1.add_features_from(d2)
            assert isinstance(d1.get_data(), original_type)
            assert d1.get_data().shape == (n_row, n_col * idx)
            res_feature_names += [f"D{idx}_{name}" for name in names]
            assert d1.feature_name == res_feature_names


def test_add_features_does_not_fail_if_initial_dataset_has_zero_informative_features(capsys, rng):
    arr_a = np.zeros((100, 1), dtype=np.float32)
    arr_b = rng.uniform(size=(100, 5))

    dataset_a = lgb.Dataset(arr_a, params={"verbose": 0}).construct()
    expected_msg = (
        "[LightGBM] [Warning] There are no meaningful features which satisfy "
        "the provided configuration. Decreasing Dataset parameters min_data_in_bin "
        "or min_data_in_leaf and re-constructing Dataset might resolve this warning.\n"
    )
    log_lines = capsys.readouterr().out
    assert expected_msg in log_lines

    dataset_b = lgb.Dataset(arr_b).construct()

    original_handle = dataset_a._handle.value
    dataset_a.add_features_from(dataset_b)
    assert dataset_a.num_feature() == 6
    assert dataset_a.num_data() == 100
    assert dataset_a._handle.value == original_handle


def test_cegb_affects_behavior(tmp_path, rng):
    X = rng.uniform(size=(100, 5))
    X[:, [1, 3]] = 0
    y = rng.uniform(size=(100,))
    names = [f"col_{i}" for i in range(5)]
    ds = lgb.Dataset(X, feature_name=names).construct()
    ds.set_label(y)
    base = lgb.Booster(train_set=ds)
    for _ in range(10):
        base.update()
    basename = tmp_path / "basename.txt"
    base.save_model(basename)
    with open(basename, "rt") as f:
        basetxt = f.read()
    # Set extremely harsh penalties, so CEGB will block most splits.
    cases = [
        {"cegb_penalty_feature_coupled": [50, 100, 10, 25, 30]},
        {"cegb_penalty_feature_lazy": [1, 2, 3, 4, 5]},
        {"cegb_penalty_split": 1},
    ]
    for case in cases:
        booster = lgb.Booster(train_set=ds, params=case)
        for _ in range(10):
            booster.update()
        casename = tmp_path / "casename.txt"
        booster.save_model(casename)
        with open(casename, "rt") as f:
            casetxt = f.read()
        assert basetxt != casetxt


def test_cegb_scaling_equalities(tmp_path, rng):
    X = rng.uniform(size=(100, 5))
    X[:, [1, 3]] = 0
    y = rng.uniform(size=(100,))
    names = [f"col_{i}" for i in range(5)]
    ds = lgb.Dataset(X, feature_name=names).construct()
    ds.set_label(y)
    # Compare pairs of penalties, to ensure scaling works as intended
    pairs = [
        (
            {"cegb_penalty_feature_coupled": [1, 2, 1, 2, 1]},
            {"cegb_penalty_feature_coupled": [0.5, 1, 0.5, 1, 0.5], "cegb_tradeoff": 2},
        ),
        (
            {"cegb_penalty_feature_lazy": [0.01, 0.02, 0.03, 0.04, 0.05]},
            {"cegb_penalty_feature_lazy": [0.005, 0.01, 0.015, 0.02, 0.025], "cegb_tradeoff": 2},
        ),
        ({"cegb_penalty_split": 1}, {"cegb_penalty_split": 2, "cegb_tradeoff": 0.5}),
    ]
    for p1, p2 in pairs:
        booster1 = lgb.Booster(train_set=ds, params=p1)
        booster2 = lgb.Booster(train_set=ds, params=p2)
        for _ in range(10):
            booster1.update()
            booster2.update()
        p1name = tmp_path / "p1.txt"
        # Reset booster1's parameters to p2, so the parameter section of the file matches.
        booster1.reset_parameter(p2)
        booster1.save_model(p1name)
        with open(p1name, "rt") as f:
            p1txt = f.read()
        p2name = tmp_path / "p2.txt"
        booster2.save_model(p2name)
        with open(p2name, "rt") as f:
            p2txt = f.read()
        assert p1txt == p2txt


def test_consistent_state_for_dataset_fields():
    def check_asserts(data):
        np.testing.assert_allclose(data.label, data.get_label())
        np.testing.assert_allclose(data.label, data.get_field("label"))
        assert not np.isnan(data.label[0])
        assert not np.isinf(data.label[1])
        np.testing.assert_allclose(data.weight, data.get_weight())
        np.testing.assert_allclose(data.weight, data.get_field("weight"))
        assert not np.isnan(data.weight[0])
        assert not np.isinf(data.weight[1])
        np.testing.assert_allclose(data.init_score, data.get_init_score())
        np.testing.assert_allclose(data.init_score, data.get_field("init_score"))
        assert not np.isnan(data.init_score[0])
        assert not np.isinf(data.init_score[1])
        assert np.all(np.isclose([data.label[0], data.weight[0], data.init_score[0]], data.label[0]))
        assert data.label[1] == pytest.approx(data.weight[1])
        assert data.feature_name == data.get_feature_name()

    X, y = load_breast_cancer(return_X_y=True)
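    # The same vector is reused as label, weight and init_score; it holds NaN at index 0 and Inf at index 1.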
    sequence = np.ones(y.shape[0])
    sequence[0] = np.nan
    sequence[1] = np.inf
    feature_names = [f"f{i}" for i in range(X.shape[1])]
    lgb_data = lgb.Dataset(X, sequence, weight=sequence, init_score=sequence, feature_name=feature_names).construct()
    check_asserts(lgb_data)
    lgb_data = lgb.Dataset(X, y).construct()
    lgb_data.set_label(sequence)
    lgb_data.set_weight(sequence)
    lgb_data.set_init_score(sequence)
    lgb_data.set_feature_name(feature_names)
    check_asserts(lgb_data)


def test_dataset_construction_overwrites_user_provided_metadata_fields():
    X = np.array([[1.0, 2.0], [3.0, 4.0]])

    position = np.array([0.0, 1.0], dtype=np.float32)
    if getenv("TASK", "") == "cuda":
        position = None

    dtrain = lgb.Dataset(
        X,
        params={"min_data_in_bin": 1, "min_data_in_leaf": 1, "verbosity": -1},
        group=[1, 1],
        init_score=[0.312, 0.708],
        label=[1, 2],
        position=position,
        weight=[0.5, 1.5],
    )

    # unconstructed, get_* methods should return whatever was provided
    assert dtrain.group == [1, 1]
    assert dtrain.get_group() == [1, 1]
    assert dtrain.init_score == [0.312, 0.708]
    assert dtrain.get_init_score() == [0.312, 0.708]
    assert dtrain.label == [1, 2]
    assert dtrain.get_label() == [1, 2]
    if getenv("TASK", "") != "cuda":
        np_assert_array_equal(dtrain.position, np.array([0.0, 1.0], dtype=np.float32), strict=True)
        np_assert_array_equal(dtrain.get_position(), np.array([0.0, 1.0], dtype=np.float32), strict=True)
    assert dtrain.weight == [0.5, 1.5]
    assert dtrain.get_weight() == [0.5, 1.5]

    # before construction, get_field() should raise an exception
    for field_name in ["group", "init_score", "label", "position", "weight"]:
        with pytest.raises(Exception, match=f"Cannot get {field_name} before construct Dataset"):
            dtrain.get_field(field_name)

    # constructed, get_* methods should return numpy arrays, even when the provided
    # input was a list of floats or ints
    dtrain.construct()
    expected_group = np.array([1, 1], dtype=np.int32)
    np_assert_array_equal(dtrain.group, expected_group, strict=True)
    np_assert_array_equal(dtrain.get_group(), expected_group, strict=True)
    # get_field("group") returns a numpy array with boundaries, instead of size
    np_assert_array_equal(dtrain.get_field("group"), np.array([0, 1, 2], dtype=np.int32), strict=True)

    expected_init_score = np.array(
        [0.312, 0.708],
    )
    np_assert_array_equal(dtrain.init_score, expected_init_score, strict=True)
    np_assert_array_equal(dtrain.get_init_score(), expected_init_score, strict=True)
    np_assert_array_equal(dtrain.get_field("init_score"), expected_init_score, strict=True)

    expected_label = np.array([1, 2], dtype=np.float32)
    np_assert_array_equal(dtrain.label, expected_label, strict=True)
    np_assert_array_equal(dtrain.get_label(), expected_label, strict=True)
    np_assert_array_equal(dtrain.get_field("label"), expected_label, strict=True)

    if getenv("TASK", "") != "cuda":
        expected_position = np.array([0.0, 1.0], dtype=np.float32)
        np_assert_array_equal(dtrain.position, expected_position, strict=True)
        np_assert_array_equal(dtrain.get_position(), expected_position, strict=True)
        # NOTE: "position" is converted to int32 on the C++ side
        np_assert_array_equal(dtrain.get_field("position"), np.array([0.0, 1.0], dtype=np.int32), strict=True)

    expected_weight = np.array([0.5, 1.5], dtype=np.float32)
    np_assert_array_equal(dtrain.weight, expected_weight, strict=True)
    np_assert_array_equal(dtrain.get_weight(), expected_weight, strict=True)
    np_assert_array_equal(dtrain.get_field("weight"), expected_weight, strict=True)


def test_dataset_construction_with_high_cardinality_categorical_succeeds(rng):
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame({"x1": rng.integers(low=0, high=5_000, size=(10_000,))})
    y = rng.uniform(size=(10_000,))
    ds = lgb.Dataset(X, y, categorical_feature=["x1"])
    ds.construct()
    assert ds.num_data() == 10_000
    assert ds.num_feature() == 1


def test_choose_param_value():
    original_params = {
        "local_listen_port": 1234,
        "port": 2222,
        "metric": "auc",
        "num_trees": 81,
        "n_iter": 13,
    }

    # should resolve duplicate aliases, and prefer the main parameter
    params = lgb.basic._choose_param_value(
        main_param_name="local_listen_port", params=original_params, default_value=5555
    )
    assert params["local_listen_port"] == 1234
    assert "port" not in params

    # should choose the highest priority alias and set that value on main param
    # if only aliases are used
    params = lgb.basic._choose_param_value(main_param_name="num_iterations", params=params, default_value=17)
    assert params["num_iterations"] == 13
    assert "num_trees" not in params
    assert "n_iter" not in params

    # should use the default if main param and aliases are missing
    params = lgb.basic._choose_param_value(main_param_name="learning_rate", params=params, default_value=0.789)
    assert params["learning_rate"] == 0.789

    # all changes should be made on copies and not modify the original
    expected_params = {
        "local_listen_port": 1234,
        "port": 2222,
        "metric": "auc",
        "num_trees": 81,
        "n_iter": 13,
    }
    assert original_params == expected_params


def test_choose_param_value_preserves_nones():
    # preserves None found for main param and still removes aliases
    params = lgb.basic._choose_param_value(
        main_param_name="num_threads",
        params={"num_threads": None, "n_jobs": 4, "objective": "regression"},
        default_value=2,
    )
    assert params == {"num_threads": None, "objective": "regression"}

    # correctly chooses value when only an alias is provided
    params = lgb.basic._choose_param_value(
        main_param_name="num_threads", params={"n_jobs": None, "objective": "regression"}, default_value=2
    )
    assert params == {"num_threads": None, "objective": "regression"}

    # adds None if that's given as the default and param not found
    params = lgb.basic._choose_param_value(
        main_param_name="min_data_in_leaf", params={"objective": "regression"}, default_value=None
    )
    assert params == {"objective": "regression", "min_data_in_leaf": None}


@pytest.mark.parametrize("objective_alias", lgb.basic._ConfigAliases.get("objective"))
def test_choose_param_value_objective(objective_alias):
    # If callable is found in objective
    params = {objective_alias: dummy_obj}
    params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=None)
    assert params["objective"] == dummy_obj

    # Value in params should be preferred to the default_value passed from keyword arguments
    params = {objective_alias: dummy_obj}
    params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=mse_obj)
    assert params["objective"] == dummy_obj

    # None of objective or its aliases in params, but default_value is callable.
    params = {}
    params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=mse_obj)
    assert params["objective"] == mse_obj


@pytest.mark.parametrize("collection", ["1d_np", "2d_np", "pd_float", "pd_str", "1d_list", "2d_list"])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_list_to_1d_numpy(collection, dtype, rng):
    collection2y = {
        "1d_np": rng.uniform(size=(10,)),
        "2d_np": rng.uniform(size=(10, 1)),
        "pd_float": rng.uniform(size=(10,)),
        "pd_str": ["a", "b"],
        "1d_list": [1] * 10,
        "2d_list": [[1], [2]],
    }
    y = collection2y[collection]
    custom_name = "my_custom_variable"

    if collection.startswith("pd"):
        pd = pytest.importorskip("pandas")
        y = pd_Series(y)
        if pd.api.types.is_object_dtype(y):
            with pytest.raises(
                ValueError,
                match=r"pandas dtypes must be int, float or bool\.\nFields with bad pandas dtypes: 0: object",
            ):
                lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name=custom_name)
            return
        elif pd.api.types.is_string_dtype(y):
            with pytest.raises(
                ValueError, match=r"pandas dtypes must be int, float or bool\.\nFields with bad pandas dtypes: 0: str"
            ):
                lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name=custom_name)
            return

    if isinstance(y, np.ndarray) and len(y.shape) == 2:
        with pytest.warns(UserWarning, match="column-vector"):
            lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name=custom_name)
        return
    elif isinstance(y, list) and isinstance(y[0], list):
        err_msg = (
            rf"Wrong type\(list\) for {custom_name}.\n"
            r"It should be list, numpy 1-D array or pandas Series"
        )
        with pytest.raises(TypeError, match=err_msg):
            lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name=custom_name)
        return

    result = lgb.basic._list_to_1d_numpy(y, dtype=dtype, name=custom_name)
    assert result.size == 10
    assert result.dtype == dtype


@pytest.mark.parametrize("init_score_type", ["array", "dataframe", "list"])
def test_init_score_for_multiclass_classification(init_score_type, rng):
    init_score = [[i * 10 + j for j in range(3)] for i in range(10)]
    if init_score_type == "array":
        init_score = np.array(init_score)
    elif init_score_type == "dataframe":
        if not PANDAS_INSTALLED:
            pytest.skip("Pandas is not installed.")
        init_score = pd_DataFrame(init_score)
    data = rng.uniform(size=(10, 2))
    ds = lgb.Dataset(data, init_score=init_score).construct()
    np.testing.assert_equal(ds.get_field("init_score"), init_score)
    np.testing.assert_equal(ds.init_score, init_score)


def test_smoke_custom_parser(tmp_path):
    data_path = Path(__file__).absolute().parents[2] / "examples" / "binary_classification" / "binary.train"
    parser_config_file = tmp_path / "parser.ini"
    with open(parser_config_file, "w") as fout:
        fout.write('{"className": "dummy", "id": "1"}')

    data = lgb.Dataset(data_path, params={"parser_config_file": parser_config_file})
    with pytest.raises(
        lgb.basic.LightGBMError, match="Cannot find parser class 'dummy', please register first or check config format"
    ):
        data.construct()


def test_param_aliases():
    aliases = lgb.basic._ConfigAliases.aliases
    assert isinstance(aliases, dict)
    assert len(aliases) > 100
    assert all(isinstance(i, list) for i in aliases.values())
    assert all(len(i) >= 1 for i in aliases.values())
    assert all(k in v for k, v in aliases.items())
    assert lgb.basic._ConfigAliases.get("config", "task") == {"config", "config_file", "task", "task_type"}
    assert lgb.basic._ConfigAliases.get_sorted("min_data_in_leaf") == [
        "min_data_in_leaf",
        "min_data",
        "min_samples_leaf",
        "min_child_samples",
        "min_data_per_leaf",
    ]


def _bad_gradients(preds, _):
    rng = np.random.default_rng()
    # "bad" = 1 element too many
    size = (len(preds) + 1,)
    return rng.standard_normal(size=size), rng.uniform(size=size)


def _good_gradients(preds, _):
    rng = np.random.default_rng()
    return rng.standard_normal(size=preds.shape), rng.uniform(size=preds.shape)


def test_custom_objective_safety(rng):
    nrows = 100
    X = rng.standard_normal(size=(nrows, 5))
    y_binary = np.arange(nrows) % 2
    classes = [0, 1, 2]
    nclass = len(classes)
    y_multiclass = np.arange(nrows) % nclass
    ds_binary = lgb.Dataset(X, y_binary).construct()
    ds_multiclass = lgb.Dataset(X, y_multiclass).construct()
    bad_bst_binary = lgb.Booster({"objective": "none"}, ds_binary)
    good_bst_binary = lgb.Booster({"objective": "none"}, ds_binary)
    bad_bst_multi = lgb.Booster({"objective": "none", "num_class": nclass}, ds_multiclass)
    good_bst_multi = lgb.Booster({"objective": "none", "num_class": nclass}, ds_multiclass)
    good_bst_binary.update(fobj=_good_gradients)
    with pytest.raises(ValueError, match=re.escape("number of models per one iteration (1)")):
        bad_bst_binary.update(fobj=_bad_gradients)
    good_bst_multi.update(fobj=_good_gradients)
    with pytest.raises(ValueError, match=re.escape(f"number of models per one iteration ({nclass})")):
        bad_bst_multi.update(fobj=_bad_gradients)


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("feature_name", [["x1", "x2"], "auto"])
def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name, rng):
    pd = pytest.importorskip("pandas")
    X = rng.uniform(size=(10, 2)).astype(dtype)
    # copy=False is necessary because starting with pandas 3.0, pd.DataFrame() creates
    # a copy of the input numpy array by default
    # ref: https://github.com/pandas-dev/pandas/issues/58913
    df = pd.DataFrame(X, copy=False)
    built_data = lgb.basic._data_from_pandas(
        data=df, feature_name=feature_name, categorical_feature="auto", pandas_categorical=None
    )[0]
    assert built_data.dtype == dtype
    assert np.shares_memory(X, built_data)


@pytest.mark.parametrize("feature_name", [["x1"], [42], "auto"])
@pytest.mark.parametrize("categories", ["seen", "unseen"])
def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories, rng):
    pd = pytest.importorskip("pandas")
    X = rng.choice(a=["a", "b"], size=(100, 1))
    column_name = "a" if feature_name == "auto" else feature_name[0]
    df = pd.DataFrame(X.copy(), columns=[column_name], dtype="category")
    if categories == "seen":
        pandas_categorical = [["a", "b"]]
    else:
        pandas_categorical = [["a"]]
    data = lgb.basic._data_from_pandas(
        data=df,
        feature_name=feature_name,
        categorical_feature="auto",
        pandas_categorical=pandas_categorical,
    )[0]
    # check that the original data wasn't modified
    np.testing.assert_equal(df[column_name], X[:, 0])
    # check that the built data has the codes
    if categories == "seen":
        # if all categories were seen during training we just take the codes
        codes = df[column_name].cat.codes
    else:
        # if we only saw 'a' during training we just replace its code
        # and leave the rest as nan
        a_code = df[column_name].cat.categories.get_loc("a")
        codes = np.where(df[column_name] == "a", a_code, np.nan)
    np.testing.assert_equal(codes, data[:, 0])


@pytest.mark.parametrize("min_data_in_bin", [2, 10])
def test_feature_num_bin(min_data_in_bin, rng):
    X = np.vstack(
        [
            rng.uniform(size=(100,)),
            np.array([1, 2] * 50),
            np.array([0, 1, 2] * 33 + [0]),
            np.array([1, 2] * 49 + 2 * [np.nan]),
            np.zeros(100),
            rng.choice(a=[0, 1], size=(100,)),
        ]
    ).T
    n_continuous = X.shape[1] - 1
    feature_name = [f"x{i}" for i in range(n_continuous)] + ["cat1"]
    ds_kwargs = {
        "params": {"min_data_in_bin": min_data_in_bin},
        "categorical_feature": [n_continuous],  # last feature
    }
    ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct()
    expected_num_bins = [
        100 // min_data_in_bin + 1,  # extra bin for zero
        3,  # 0, 1, 2
        3,  # 0, 1, 2
        4,  # 0, 1, 2 + nan
        0,  # unused
        3,  # 0, 1 + nan
    ]
    actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])]
    assert actual_num_bins == expected_num_bins
    # test using defined feature names
    bins_by_name = [ds.feature_num_bin(name) for name in feature_name]
    assert bins_by_name == expected_num_bins
    # test using default feature names
    ds_no_names = lgb.Dataset(X, **ds_kwargs).construct()
    default_names = [f"Column_{i}" for i in range(X.shape[1])]
    bins_by_default_name = [ds_no_names.feature_num_bin(name) for name in default_names]
    assert bins_by_default_name == expected_num_bins
    # check for feature indices outside of range
    num_features = X.shape[1]
    with pytest.raises(
        lgb.basic.LightGBMError,
        match=(
            f"Tried to retrieve number of bins for feature index {num_features}, "
            f"but the valid feature indices are \\[0, {num_features - 1}\\]."
        ),
    ):
        ds.feature_num_bin(num_features)


def test_feature_num_bin_with_max_bin_by_feature(rng):
    X = rng.uniform(size=(100, 3))
    max_bin_by_feature = rng.integers(low=3, high=30, size=X.shape[1])
    ds = lgb.Dataset(X, params={"max_bin_by_feature": max_bin_by_feature}).construct()
    actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])]
    np.testing.assert_equal(actual_num_bins, max_bin_by_feature)


def test_set_leaf_output():
    X, y = load_breast_cancer(return_X_y=True)
    ds = lgb.Dataset(X, y)
    bst = lgb.Booster({"num_leaves": 2}, ds)
    bst.update()
    y_pred = bst.predict(X)
    for leaf_id in range(2):
        leaf_output = bst.get_leaf_output(tree_id=0, leaf_id=leaf_id)
        bst.set_leaf_output(tree_id=0, leaf_id=leaf_id, value=leaf_output + 1)
    np.testing.assert_allclose(bst.predict(X), y_pred + 1)


def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Dataset(rng):
    ds = lgb.Dataset(
        data=rng.standard_normal(size=(100, 3)),
    )
    assert ds.construct().feature_name == ["Column_0", "Column_1", "Column_2"]


# NOTE: this intentionally contains values where num_leaves <, ==, and > (max_depth^2)
@pytest.mark.parametrize(("max_depth", "num_leaves"), [(-1, 3), (-1, 50), (5, 3), (5, 31), (5, 32), (8, 3), (8, 31)])
def test_max_depth_warning_is_not_raised_if_num_leaves_is_also_provided(capsys, num_leaves, max_depth):
    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
    lgb.Booster(
        params={
            "objective": "binary",
            "max_depth": max_depth,
            "num_leaves": num_leaves,
            "num_iterations": 1,
            "verbose": 0,
        },
        train_set=lgb.Dataset(X, label=y),
    )
    assert "Provided parameters constrain tree depth" not in capsys.readouterr().out


# NOTE: max_depth < 5 is significant here because the default for num_leaves=31. With max_depth=5,
#       a full depth-wise tree would have 2^5 = 32 leaves.
@pytest.mark.parametrize("max_depth", [1, 2, 3, 4])
def test_max_depth_warning_is_not_raised_if_max_depth_gt_1_and_lt_5_and_num_leaves_omitted(capsys, max_depth):
    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
    lgb.Booster(
        params={
            "objective": "binary",
            "max_depth": max_depth,
            "num_iterations": 1,
            "verbose": 0,
        },
        train_set=lgb.Dataset(X, label=y),
    )
    assert "Provided parameters constrain tree depth" not in capsys.readouterr().out


@pytest.mark.parametrize("max_depth", [5, 6, 7, 8, 9])
def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(capsys, max_depth):
    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
    lgb.Booster(
        params={
            "objective": "binary",
            "max_depth": max_depth,
            "num_iterations": 1,
            "verbose": 0,
        },
        train_set=lgb.Dataset(X, label=y),
    )
    expected_warning = (
        f"[LightGBM] [Warning] Provided parameters constrain tree depth (max_depth={max_depth}) without explicitly "
        f"setting 'num_leaves'. This can lead to underfitting. To resolve this warning, pass 'num_leaves' (<={2**max_depth}) "
        "in params. Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity."
    )
    assert expected_warning in capsys.readouterr().out


@pytest.mark.parametrize("order", ["C", "F"])
@pytest.mark.parametrize("dtype", ["float32", "int64"])
def test_no_copy_in_dataset_from_numpy_2d(rng, order, dtype):
    X = rng.random(size=(100, 3))
    X = np.require(X, dtype=dtype, requirements=order)
    X1d, layout = lgb.basic._np2d_to_np1d(X)
    if order == "F":
        assert layout == lgb.basic._C_API_IS_COL_MAJOR
    else:
        assert layout == lgb.basic._C_API_IS_ROW_MAJOR
    if dtype == "float32":
        assert np.shares_memory(X, X1d)
    else:
        # makes a copy
        assert not np.shares_memory(X, X1d)


def test_equal_datasets_from_row_major_and_col_major_data(tmp_path):
    # row-major dataset
    X_row, y = make_blobs(n_samples=1_000, n_features=3, centers=2)
    assert X_row.flags["C_CONTIGUOUS"]
    assert not X_row.flags["F_CONTIGUOUS"]
    ds_row = lgb.Dataset(X_row, y)
    ds_row_path = tmp_path / "ds_row.txt"
    ds_row._dump_text(ds_row_path)

    # col-major dataset
    X_col = np.asfortranarray(X_row)
    assert X_col.flags["F_CONTIGUOUS"]
    assert not X_col.flags["C_CONTIGUOUS"]
    ds_col = lgb.Dataset(X_col, y)
    ds_col_path = tmp_path / "ds_col.txt"
    ds_col._dump_text(ds_col_path)

    # check datasets are equal
    assert filecmp.cmp(ds_row_path, ds_col_path)


def test_equal_datasets_from_one_and_several_matrices_w_different_layouts(rng, tmp_path):
    # several matrices
    mats = [np.require(rng.random(size=(100, 2)), requirements=order) for order in ("C", "F", "F", "C")]
    several_path = tmp_path / "several.txt"
    lgb.Dataset(mats)._dump_text(several_path)

    # one matrix
    mat = np.vstack(mats)
    one_path = tmp_path / "one.txt"
    lgb.Dataset(mat)._dump_text(one_path)

    assert filecmp.cmp(one_path, several_path)