test_basic.py 39.4 KB
Newer Older
wxchan's avatar
wxchan committed
1
# coding: utf-8
2
3
import filecmp
import numbers
4
import re
5
from copy import deepcopy
6
from os import getenv
7
from pathlib import Path
wxchan's avatar
wxchan committed
8

wxchan's avatar
wxchan committed
9
import numpy as np
10
import pytest
11
from scipy import sparse
12
from sklearn.datasets import dump_svmlight_file, load_svmlight_file, make_blobs
wxchan's avatar
wxchan committed
13
from sklearn.model_selection import train_test_split
wxchan's avatar
wxchan committed
14

15
import lightgbm as lgb
16
from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame, pd_Series
17

18
from .utils import dummy_obj, load_breast_cancer, mse_obj, np_assert_array_equal
19

wxchan's avatar
wxchan committed
20

21
def test_basic(tmp_path):
    """End-to-end smoke test: train a binary classifier, then verify feature
    names, score bounds, model persistence, early-stopped prediction, and
    shape validation at predict time."""
    X_train, X_test, y_train, y_test = train_test_split(
        *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
    )

    feature_names = [f"Column_{i}" for i in range(X_train.shape[1])]
    feature_names[1] = "a" * 1000  # set one name to a value longer than default buffer size
    train_data = lgb.Dataset(X_train, label=y_train, feature_name=feature_names)
    valid_data = train_data.create_valid(X_test, label=y_test)

    params = {
        "objective": "binary",
        "metric": "auc",
        "min_data": 10,
        "num_leaves": 15,
        "verbose": -1,
        "num_threads": 1,
        "max_bin": 255,
        "gpu_use_dp": True,
    }
    bst = lgb.Booster(params, train_data)
    bst.add_valid(valid_data, "valid_1")

    for iteration in range(20):
        bst.update()
        if iteration % 10 == 0:
            print(bst.eval_train(), bst.eval_valid())

    assert train_data.get_feature_name() == feature_names

    assert bst.current_iteration() == 20
    assert bst.num_trees() == 20
    assert bst.num_model_per_iteration() == 1
    if getenv("TASK", "") != "cuda":
        assert bst.lower_bound() == pytest.approx(-2.9040190126976606)
        assert bst.upper_bound() == pytest.approx(3.3182142872462883)

    tname = tmp_path / "svm_light.dat"
    model_file = tmp_path / "model.txt"

    bst.save_model(model_file)
    pred_from_matr = bst.predict(X_test)
    with open(tname, "w+b") as f:
        dump_svmlight_file(X_test, y_test, f)
    pred_from_file = bst.predict(tname)
    # matrix input and svmlight-file input must score identically (up to fp tolerance)
    np.testing.assert_allclose(pred_from_matr, pred_from_file)

    # check saved model persistence
    bst = lgb.Booster(params, model_file=model_file)
    assert bst.feature_name() == feature_names
    pred_from_model_file = bst.predict(X_test)
    # we need to check the consistency of model file here, so test for exact equal
    np.testing.assert_array_equal(pred_from_matr, pred_from_model_file)

    # check early stopping is working. Make it stop very early, so the scores should be very close to zero
    pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5}
    pred_early_stopping = bst.predict(X_test, **pred_parameter)
    # scores likely to be different, but prediction should still be the same
    np.testing.assert_array_equal(np.sign(pred_from_matr), np.sign(pred_early_stopping))

    # test that shape is checked during prediction
    bad_X_test = X_test[:, 1:]
    bad_shape_error_msg = "The number of features in data*"
    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, bad_X_test)
    np.testing.assert_raises_regex(
        lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, sparse.csr_matrix(bad_X_test)
    )
    np.testing.assert_raises_regex(
        lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, sparse.csc_matrix(bad_X_test)
    )
    with open(tname, "w+b") as f:
        dump_svmlight_file(bad_X_test, y_test, f)
    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, tname)
    with open(tname, "w+b") as f:
        dump_svmlight_file(X_test, y_test, f, zero_based=False)
    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, tname)
96
97


98
99
100
101
102
103
104
105
106
107
108
109
110
class NumpySequence(lgb.Sequence):
    """Minimal ``lgb.Sequence`` implementation backed by an in-memory numpy array.

    ``batch_size`` is stored for the Sequence protocol; indexing is delegated
    to the wrapped array.
    """

    def __init__(self, ndarray, batch_size):
        self.ndarray = ndarray
        self.batch_size = batch_size

    def __getitem__(self, idx):
        # A plain "return self.ndarray[idx]" would suffice; the explicit
        # branches below exist for demo/testing of each supported index type.
        if isinstance(idx, numbers.Integral):
            return self.ndarray[idx]
        if isinstance(idx, slice):
            if not (idx.step is None or idx.step == 1):
                raise NotImplementedError("No need to implement, caller will not set step by now")
            return self.ndarray[idx.start : idx.stop]
        if isinstance(idx, list):
            return self.ndarray[idx]
        raise TypeError(f"Sequence Index must be an integer/list/slice, got {type(idx).__name__}")

    def __len__(self):
        return len(self.ndarray)


def _create_sequence_from_ndarray(data, num_seq, batch_size):
    """Wrap ``data`` in a single NumpySequence, or split it row-wise into
    ``num_seq`` roughly-equal sequences when ``num_seq`` > 1."""
    if num_seq == 1:
        return NumpySequence(data, batch_size)

    nrow = data.shape[0]
    chunk = nrow // num_seq
    # Last chunk absorbs any remainder rows via the min() clamp.
    return [NumpySequence(data[start : min(start + chunk, nrow)], batch_size) for start in range(0, nrow, chunk)]


135
136
137
138
@pytest.mark.parametrize("sample_count", [11, 100, None])
@pytest.mark.parametrize("batch_size", [3, None])
@pytest.mark.parametrize("include_0_and_nan", [False, True])
@pytest.mark.parametrize("num_seq", [1, 3])
def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq, rng):
    """A Dataset built from Sequence objects must serialize to a binary file
    identical to one built directly from the equivalent numpy array, for both
    training and validation datasets."""
    params = {"bin_construct_sample_cnt": sample_count}

    nrow = 50
    half_nrow = nrow // 2
    ncol = 11
    data = np.arange(nrow * ncol, dtype=np.float64).reshape((nrow, ncol))

    if include_0_and_nan:
        # whole col
        data[:, 0] = 0
        data[:, 1] = np.nan

        # half col
        data[:half_nrow, 3] = 0
        data[:half_nrow, 2] = np.nan

        data[half_nrow:-2, 4] = 0
        data[:half_nrow, 4] = np.nan

    X = data[:, :-1]
    Y = data[:, -1]

    npy_bin_fname = tmpdir / "data_from_npy.bin"
    seq_bin_fname = tmpdir / "data_from_seq.bin"

    # Create dataset from numpy array directly.
    ds = lgb.Dataset(X, label=Y, params=params)
    ds.save_binary(npy_bin_fname)

    # Create dataset using Sequence.
    seqs = _create_sequence_from_ndarray(X, num_seq, batch_size)
    seq_ds = lgb.Dataset(seqs, label=Y, params=params)
    seq_ds.save_binary(seq_bin_fname)

    assert filecmp.cmp(npy_bin_fname, seq_bin_fname)

    # Test for validation set.
    # Select some random rows as valid data.
    valid_idx = (rng.random(10) * nrow).astype(np.int32)
    valid_data = data[valid_idx, :]
    valid_X = valid_data[:, :-1]
    valid_Y = valid_data[:, -1]

    valid_npy_bin_fname = tmpdir / "valid_data_from_npy.bin"
    valid_seq_bin_fname = tmpdir / "valid_data_from_seq.bin"
    valid_seq2_bin_fname = tmpdir / "valid_data_from_seq2.bin"

    valid_ds = lgb.Dataset(valid_X, label=valid_Y, params=params, reference=ds)
    valid_ds.save_binary(valid_npy_bin_fname)

    # From Dataset constructor, with dataset from numpy array.
    valid_seqs = _create_sequence_from_ndarray(valid_X, num_seq, batch_size)
    valid_seq_ds = lgb.Dataset(valid_seqs, label=valid_Y, params=params, reference=ds)
    valid_seq_ds.save_binary(valid_seq_bin_fname)
    assert filecmp.cmp(valid_npy_bin_fname, valid_seq_bin_fname)

    # From Dataset.create_valid, with dataset from sequence.
    valid_seq_ds2 = seq_ds.create_valid(valid_seqs, label=valid_Y, params=params)
    valid_seq_ds2.save_binary(valid_seq2_bin_fname)
    assert filecmp.cmp(valid_npy_bin_fname, valid_seq2_bin_fname)


202
@pytest.mark.parametrize("num_seq", [1, 2])
def test_sequence_get_data(num_seq, rng):
    """get_data() on a Sequence-backed Dataset returns the original sequences;
    a subset returns the underlying rows in sorted-index order."""
    nrow = 20
    ncol = 11
    data = np.arange(nrow * ncol, dtype=np.float64).reshape((nrow, ncol))
    X = data[:, :-1]
    Y = data[:, -1]

    seqs = _create_sequence_from_ndarray(data=X, num_seq=num_seq, batch_size=6)
    seq_ds = lgb.Dataset(seqs, label=Y, params=None, free_raw_data=False).construct()
    assert seq_ds.get_data() == seqs

    used_indices = rng.choice(a=np.arange(nrow), size=nrow // 3, replace=False)
    subset_data = seq_ds.subset(used_indices).construct()
    np.testing.assert_array_equal(subset_data.get_data(), X[sorted(used_indices)])
217
218


219
def test_chunked_dataset():
    """Dataset construction must accept a list of row-wise numpy chunks."""
    X_train, X_test, y_train, y_test = train_test_split(
        *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
    )

    # chunk size is derived from the training set and reused for the test set
    chunk_size = X_train.shape[0] // 10 + 1

    def as_chunks(arr):
        # split rows into consecutive slices of at most chunk_size rows
        return [arr[i * chunk_size : (i + 1) * chunk_size, :] for i in range(arr.shape[0] // chunk_size + 1)]

    X_train = as_chunks(X_train)
    X_test = as_chunks(X_test)

    train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100})
    valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100})
    train_data.construct()
    valid_data.construct()


def test_chunked_dataset_linear():
    """Chunked-list Dataset construction must also work with linear trees enabled."""
    X_train, X_test, y_train, y_test = train_test_split(
        *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
    )
    chunk_size = X_train.shape[0] // 10 + 1

    def as_chunks(arr):
        # split rows into consecutive slices of at most chunk_size rows
        return [arr[i * chunk_size : (i + 1) * chunk_size, :] for i in range(arr.shape[0] // chunk_size + 1)]

    X_train = as_chunks(X_train)
    X_test = as_chunks(X_test)
    params = {"bin_construct_sample_cnt": 100, "linear_tree": True}

    train_data = lgb.Dataset(X_train, label=y_train, params=params)
    valid_data = train_data.create_valid(X_test, label=y_test, params=params)
    train_data.construct()
    valid_data.construct()


248
249
def test_save_dataset_subset_and_load_from_file(tmp_path, rng):
    """A subset of a Dataset can be saved to a binary file and loaded back."""
    raw = rng.standard_normal(size=(100, 2))
    params = {"max_bin": 50, "min_data_in_bin": 10}
    full_ds = lgb.Dataset(raw, params=params)
    full_ds.subset([1, 2, 3, 5, 8]).save_binary(tmp_path / "subset.bin")
    # round-trip: loading the saved subset must construct without error
    lgb.Dataset(tmp_path / "subset.bin", params=params).construct()
254
255


256
def test_subset_group():
    """subset() must recompute query-group boundaries for ranking data."""
    rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank"
    X_train, y_train = load_svmlight_file(str(rank_example_dir / "rank.train"))
    q_train = np.loadtxt(str(rank_example_dir / "rank.train.query"))

    lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
    assert len(lgb_train.get_group()) == 201
    first_ten_rows = lgb_train.subset(list(range(10))).construct()
    groups = first_ten_rows.get_group()
    # the first 10 rows span exactly two query groups of sizes 1 and 9
    assert len(groups) == 2
    assert groups[0] == 1
    assert groups[1] == 9


269
270
271
def test_add_features_throws_if_num_data_unequal(rng):
    """add_features_from must reject datasets with different row counts."""
    X1 = rng.uniform(size=(100, 1))
    X2 = rng.uniform(size=(10, 1))
    hundred_rows = lgb.Dataset(X1).construct()
    ten_rows = lgb.Dataset(X2).construct()
    with pytest.raises(lgb.basic.LightGBMError):
        hundred_rows.add_features_from(ten_rows)


278
279
280
def test_add_features_throws_if_datasets_unconstructed(rng):
    """add_features_from requires both datasets to already be constructed."""
    X1 = rng.uniform(size=(100, 1))
    X2 = rng.uniform(size=(100, 1))

    # neither side constructed
    with pytest.raises(ValueError):
        d1 = lgb.Dataset(X1)
        d2 = lgb.Dataset(X2)
        d1.add_features_from(d2)
    # only the target constructed
    with pytest.raises(ValueError):
        d1 = lgb.Dataset(X1).construct()
        d2 = lgb.Dataset(X2)
        d1.add_features_from(d2)
    # only the source constructed
    with pytest.raises(ValueError):
        d1 = lgb.Dataset(X1)
        d2 = lgb.Dataset(X2).construct()
        d1.add_features_from(d2)
293
294


295
296
def test_add_features_equal_data_on_alternating_used_unused(tmp_path, rng):
    """Merging any column-wise split of a matrix must dump the same text as
    the unsplit dataset, including when unused (all-zero) columns alternate
    with informative ones."""
    X = rng.uniform(size=(100, 5))
    X[:, [1, 3]] = 0  # zero out two columns so used/unused features alternate
    names = [f"col_{i}" for i in range(5)]
    for split_at in range(1, 5):
        left = lgb.Dataset(X[:, :split_at], feature_name=names[:split_at]).construct()
        right = lgb.Dataset(X[:, split_at:], feature_name=names[split_at:]).construct()
        left.add_features_from(right)
        d1name = tmp_path / "d1.txt"
        left._dump_text(d1name)
        full = lgb.Dataset(X, feature_name=names).construct()
        dname = tmp_path / "d.txt"
        full._dump_text(dname)
        assert dname.read_text() == d1name.read_text()
Guolin Ke's avatar
Guolin Ke committed
313

314

315
316
def test_add_features_same_booster_behaviour(tmp_path, rng):
    """Training on a merged (split-then-recombined) dataset must produce a
    model file identical to training on the original unsplit dataset."""
    X = rng.uniform(size=(100, 5))
    X[:, [1, 3]] = 0  # include uninformative columns in the mix
    names = [f"col_{i}" for i in range(5)]
    for split_at in range(1, 5):
        merged = lgb.Dataset(X[:, :split_at], feature_name=names[:split_at]).construct()
        other = lgb.Dataset(X[:, split_at:], feature_name=names[split_at:]).construct()
        merged.add_features_from(other)
        full = lgb.Dataset(X, feature_name=names).construct()
        y = rng.uniform(size=(100,))
        merged.set_label(y)
        full.set_label(y)
        booster_merged = lgb.Booster(train_set=merged)
        booster_full = lgb.Booster(train_set=full)
        for _ in range(10):
            booster_full.update()
            booster_merged.update()
        dname = tmp_path / "d.txt"
        d1name = tmp_path / "d1.txt"
        booster_merged.save_model(d1name)
        booster_full.save_model(dname)
        assert dname.read_text() == d1name.read_text()


343
def test_add_features_from_different_sources(rng):
    """add_features_from across numpy/sparse/pandas sources: raw data survives
    only when both sides are mergeable and free_raw_data=False."""
    pd = pytest.importorskip("pandas")
    n_row = 100
    n_col = 5
    X = rng.uniform(size=(n_row, n_col))
    xxs = [X, sparse.csr_matrix(X), pd.DataFrame(X)]
    names = [f"col_{i}" for i in range(n_col)]
    seq = _create_sequence_from_ndarray(X, 1, 30)
    seq_ds = lgb.Dataset(seq, feature_name=names, free_raw_data=False).construct()
    npy_list_ds = lgb.Dataset(
        [X[: n_row // 2, :], X[n_row // 2 :, :]], feature_name=names, free_raw_data=False
    ).construct()
    # sources whose raw data cannot be merged into another container type
    immergeable_dds = [seq_ds, npy_list_ds]
    for x_1 in xxs:
        # test that method works even with free_raw_data=True
        d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=True).construct()
        d2 = lgb.Dataset(x_1, feature_name=names, free_raw_data=True).construct()
        d1.add_features_from(d2)
        assert d1.data is None

        # test that method works but sets raw data to None in case of immergeable data types
        d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=False).construct()
        for d2 in immergeable_dds:
            d1.add_features_from(d2)
            assert d1.data is None

        # test that method works for different data types
        d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=False).construct()
        res_feature_names = list(names)
        for idx, x_2 in enumerate(xxs, 2):
            original_type = type(d1.get_data())
            d2 = lgb.Dataset(x_2, feature_name=names, free_raw_data=False).construct()
            d1.add_features_from(d2)
            # container type is preserved and columns accumulate
            assert isinstance(d1.get_data(), original_type)
            assert d1.get_data().shape == (n_row, n_col * idx)
            res_feature_names += [f"D{idx}_{name}" for name in names]
            assert d1.feature_name == res_feature_names


382
def test_add_features_does_not_fail_if_initial_dataset_has_zero_informative_features(capsys, rng):
    """A dataset whose only feature is uninformative (all zeros) must still be
    able to absorb features from another dataset, in place."""
    arr_a = np.zeros((100, 1), dtype=np.float32)
    arr_b = rng.uniform(size=(100, 5))

    dataset_a = lgb.Dataset(arr_a, params={"verbose": 0}).construct()
    expected_msg = (
        "[LightGBM] [Warning] There are no meaningful features which satisfy "
        "the provided configuration. Decreasing Dataset parameters min_data_in_bin "
        "or min_data_in_leaf and re-constructing Dataset might resolve this warning.\n"
    )
    log_lines = capsys.readouterr().out
    assert expected_msg in log_lines

    dataset_b = lgb.Dataset(arr_b).construct()

    original_handle = dataset_a._handle.value
    dataset_a.add_features_from(dataset_b)
    assert dataset_a.num_feature() == 6
    assert dataset_a.num_data() == 100
    # the merge must reuse the existing C++ handle rather than reallocating
    assert dataset_a._handle.value == original_handle
402
403


404
405
def test_cegb_affects_behavior(tmp_path, rng):
    """Every CEGB penalty type must change the trained model relative to an
    unpenalized baseline."""
    X = rng.uniform(size=(100, 5))
    X[:, [1, 3]] = 0
    y = rng.uniform(size=(100,))
    names = [f"col_{i}" for i in range(5)]
    ds = lgb.Dataset(X, feature_name=names).construct()
    ds.set_label(y)
    base = lgb.Booster(train_set=ds)
    for _ in range(10):
        base.update()
    basename = tmp_path / "basename.txt"
    base.save_model(basename)
    basetxt = basename.read_text()
    # Set extremely harsh penalties, so CEGB will block most splits.
    cases = [
        {"cegb_penalty_feature_coupled": [50, 100, 10, 25, 30]},
        {"cegb_penalty_feature_lazy": [1, 2, 3, 4, 5]},
        {"cegb_penalty_split": 1},
    ]
    for case in cases:
        booster = lgb.Booster(train_set=ds, params=case)
        for _ in range(10):
            booster.update()
        casename = tmp_path / "casename.txt"
        booster.save_model(casename)
        assert basetxt != casename.read_text()


435
436
def test_cegb_scaling_equalities(tmp_path, rng):
    """cegb_tradeoff must scale penalties linearly: a penalty of p is
    equivalent to a penalty of p/k with tradeoff k."""
    X = rng.uniform(size=(100, 5))
    X[:, [1, 3]] = 0
    y = rng.uniform(size=(100,))
    names = [f"col_{i}" for i in range(5)]
    ds = lgb.Dataset(X, feature_name=names).construct()
    ds.set_label(y)
    # Compare pairs of penalties, to ensure scaling works as intended
    pairs = [
        (
            {"cegb_penalty_feature_coupled": [1, 2, 1, 2, 1]},
            {"cegb_penalty_feature_coupled": [0.5, 1, 0.5, 1, 0.5], "cegb_tradeoff": 2},
        ),
        (
            {"cegb_penalty_feature_lazy": [0.01, 0.02, 0.03, 0.04, 0.05]},
            {"cegb_penalty_feature_lazy": [0.005, 0.01, 0.015, 0.02, 0.025], "cegb_tradeoff": 2},
        ),
        ({"cegb_penalty_split": 1}, {"cegb_penalty_split": 2, "cegb_tradeoff": 0.5}),
    ]
    for p1, p2 in pairs:
        booster1 = lgb.Booster(train_set=ds, params=p1)
        booster2 = lgb.Booster(train_set=ds, params=p2)
        for _ in range(10):
            booster1.update()
            booster2.update()
        p1name = tmp_path / "p1.txt"
        # Reset booster1's parameters to p2, so the parameter section of the file matches.
        booster1.reset_parameter(p2)
        booster1.save_model(p1name)
        p2name = tmp_path / "p2.txt"
        booster2.save_model(p2name)
        assert p1name.read_text() == p2name.read_text()


def test_consistent_state_for_dataset_fields():
    """label/weight/init_score set via the constructor or via setters must
    round-trip consistently through attributes, get_* methods, and get_field,
    with NaN/Inf values sanitized during construction."""

    def check_asserts(data):
        # each field must agree across all three access paths
        np.testing.assert_allclose(data.label, data.get_label())
        np.testing.assert_allclose(data.label, data.get_field("label"))
        # NaN (index 0) and Inf (index 1) must have been replaced
        assert not np.isnan(data.label[0])
        assert not np.isinf(data.label[1])
        np.testing.assert_allclose(data.weight, data.get_weight())
        np.testing.assert_allclose(data.weight, data.get_field("weight"))
        assert not np.isnan(data.weight[0])
        assert not np.isinf(data.weight[1])
        np.testing.assert_allclose(data.init_score, data.get_init_score())
        np.testing.assert_allclose(data.init_score, data.get_field("init_score"))
        assert not np.isnan(data.init_score[0])
        assert not np.isinf(data.init_score[1])
        # all three fields came from the same source array, so they agree element-wise
        assert np.all(np.isclose([data.label[0], data.weight[0], data.init_score[0]], data.label[0]))
        assert data.label[1] == pytest.approx(data.weight[1])
        assert data.feature_name == data.get_feature_name()

    X, y = load_breast_cancer(return_X_y=True)
    sequence = np.ones(y.shape[0])
    sequence[0] = np.nan
    sequence[1] = np.inf
    feature_names = [f"f{i}" for i in range(X.shape[1])]
    # fields supplied at construction time
    lgb_data = lgb.Dataset(X, sequence, weight=sequence, init_score=sequence, feature_name=feature_names).construct()
    check_asserts(lgb_data)
    # fields supplied through the set_* API
    lgb_data = lgb.Dataset(X, y).construct()
    lgb_data.set_label(sequence)
    lgb_data.set_weight(sequence)
    lgb_data.set_init_score(sequence)
    lgb_data.set_feature_name(feature_names)
    check_asserts(lgb_data)
504
505


506
507
508
509
def test_dataset_construction_overwrites_user_provided_metadata_fields():
    """Before construct(), metadata getters echo the user-provided Python
    objects; after construct(), they return numpy arrays with LightGBM's
    canonical dtypes."""
    X = np.array([[1.0, 2.0], [3.0, 4.0]])

    is_cuda_task = getenv("TASK", "") == "cuda"
    position = np.array([0.0, 1.0], dtype=np.float32)
    if is_cuda_task:
        # the CUDA build does not support the position field
        position = None

    dtrain = lgb.Dataset(
        X,
        params={"min_data_in_bin": 1, "min_data_in_leaf": 1, "verbosity": -1},
        group=[1, 1],
        init_score=[0.312, 0.708],
        label=[1, 2],
        position=position,
        weight=[0.5, 1.5],
    )

    # unconstructed, get_* methods should return whatever was provided
    assert dtrain.group == [1, 1]
    assert dtrain.get_group() == [1, 1]
    assert dtrain.init_score == [0.312, 0.708]
    assert dtrain.get_init_score() == [0.312, 0.708]
    assert dtrain.label == [1, 2]
    assert dtrain.get_label() == [1, 2]
    if not is_cuda_task:
        np_assert_array_equal(dtrain.position, np.array([0.0, 1.0], dtype=np.float32), strict=True)
        np_assert_array_equal(dtrain.get_position(), np.array([0.0, 1.0], dtype=np.float32), strict=True)
    assert dtrain.weight == [0.5, 1.5]
    assert dtrain.get_weight() == [0.5, 1.5]

    # before construction, get_field() should raise an exception
    for field_name in ["group", "init_score", "label", "position", "weight"]:
        with pytest.raises(Exception, match=f"Cannot get {field_name} before construct Dataset"):
            dtrain.get_field(field_name)

    # constructed, get_* methods should return numpy arrays, even when the provided
    # input was a list of floats or ints
    dtrain.construct()
    expected_group = np.array([1, 1], dtype=np.int32)
    np_assert_array_equal(dtrain.group, expected_group, strict=True)
    np_assert_array_equal(dtrain.get_group(), expected_group, strict=True)
    # get_field("group") returns a numpy array with boundaries, instead of size
    np_assert_array_equal(dtrain.get_field("group"), np.array([0, 1, 2], dtype=np.int32), strict=True)

    expected_init_score = np.array([0.312, 0.708])
    np_assert_array_equal(dtrain.init_score, expected_init_score, strict=True)
    np_assert_array_equal(dtrain.get_init_score(), expected_init_score, strict=True)
    np_assert_array_equal(dtrain.get_field("init_score"), expected_init_score, strict=True)

    expected_label = np.array([1, 2], dtype=np.float32)
    np_assert_array_equal(dtrain.label, expected_label, strict=True)
    np_assert_array_equal(dtrain.get_label(), expected_label, strict=True)
    np_assert_array_equal(dtrain.get_field("label"), expected_label, strict=True)

    if not is_cuda_task:
        expected_position = np.array([0.0, 1.0], dtype=np.float32)
        np_assert_array_equal(dtrain.position, expected_position, strict=True)
        np_assert_array_equal(dtrain.get_position(), expected_position, strict=True)
        # NOTE: "position" is converted to int32 on the C++ side
        np_assert_array_equal(dtrain.get_field("position"), np.array([0.0, 1.0], dtype=np.int32), strict=True)

    expected_weight = np.array([0.5, 1.5], dtype=np.float32)
    np_assert_array_equal(dtrain.weight, expected_weight, strict=True)
    np_assert_array_equal(dtrain.get_weight(), expected_weight, strict=True)
    np_assert_array_equal(dtrain.get_field("weight"), expected_weight, strict=True)


575
def test_dataset_construction_with_high_cardinality_categorical_succeeds(rng):
    """Construction must succeed when a categorical feature has thousands of
    distinct values."""
    pd = pytest.importorskip("pandas")
    X = pd.DataFrame({"x1": rng.integers(low=0, high=5_000, size=(10_000,))})
    y = rng.uniform(size=(10_000,))
    ds = lgb.Dataset(X, y, categorical_feature=["x1"])
    ds.construct()
    assert ds.num_data() == 10_000
    assert ds.num_feature() == 1


585
586
587
588
589
def test_choose_param_value():
    """_choose_param_value resolves aliases onto the main parameter name,
    prefers the main name, falls back to defaults, and never mutates its input."""
    original_params = {
        "local_listen_port": 1234,
        "port": 2222,
        "metric": "auc",
        "num_trees": 81,
        "n_iter": 13,
    }

    # should resolve duplicate aliases, and prefer the main parameter
    params = lgb.basic._choose_param_value(
        main_param_name="local_listen_port", params=original_params, default_value=5555
    )
    assert params["local_listen_port"] == 1234
    assert "port" not in params

    # should choose the highest priority alias and set that value on main param
    # if only aliases are used
    params = lgb.basic._choose_param_value(main_param_name="num_iterations", params=params, default_value=17)
    assert params["num_iterations"] == 13
    assert "num_trees" not in params
    assert "n_iter" not in params

    # should use the default if main param and aliases are missing
    params = lgb.basic._choose_param_value(main_param_name="learning_rate", params=params, default_value=0.789)
    assert params["learning_rate"] == 0.789

    # all changes should be made on copies and not modify the original
    assert original_params == {
        "local_listen_port": 1234,
        "port": 2222,
        "metric": "auc",
        "num_trees": 81,
        "n_iter": 13,
    }
621
622


623
624
625
626
def test_choose_param_value_preserves_nones():
    """Explicit None values must survive alias resolution."""
    # preserves None found for main param and still removes aliases
    params = lgb.basic._choose_param_value(
        main_param_name="num_threads",
        params={"num_threads": None, "n_jobs": 4, "objective": "regression"},
        default_value=2,
    )
    assert params == {"num_threads": None, "objective": "regression"}

    # correctly chooses value when only an alias is provided
    params = lgb.basic._choose_param_value(
        main_param_name="num_threads", params={"n_jobs": None, "objective": "regression"}, default_value=2
    )
    assert params == {"num_threads": None, "objective": "regression"}

    # adds None if that's given as the default and param not found
    params = lgb.basic._choose_param_value(
        main_param_name="min_data_in_leaf", params={"objective": "regression"}, default_value=None
    )
    assert params == {"objective": "regression", "min_data_in_leaf": None}


645
646
647
648
@pytest.mark.parametrize("objective_alias", lgb.basic._ConfigAliases.get("objective"))
def test_choose_param_value_objective(objective_alias):
    """Callable objectives supplied under any alias must resolve onto the
    main "objective" key, taking precedence over any default."""
    # If callable is found in objective
    params = {objective_alias: dummy_obj}
    params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=None)
    assert params["objective"] == dummy_obj

    # Value in params should be preferred to the default_value passed from keyword arguments
    params = {objective_alias: dummy_obj}
    params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=mse_obj)
    assert params["objective"] == dummy_obj

    # None of objective or its aliases in params, but default_value is callable.
    params = lgb.basic._choose_param_value(main_param_name="objective", params={}, default_value=mse_obj)
    assert params["objective"] == mse_obj
661
662


663
664
@pytest.mark.parametrize("collection", ["1d_np", "2d_np", "pd_float", "pd_str", "1d_list", "2d_list"])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_list_to_1d_numpy(collection, dtype, rng):
    """_list_to_1d_numpy accepts 1-D collections, warns on column vectors, and
    rejects nested lists and non-numeric pandas Series."""
    collection2y = {
        "1d_np": rng.uniform(size=(10,)),
        "2d_np": rng.uniform(size=(10, 1)),
        "pd_float": rng.uniform(size=(10,)),
        "pd_str": ["a", "b"],
        "1d_list": [1] * 10,
        "2d_list": [[1], [2]],
    }
    y = collection2y[collection]
    if collection.startswith("pd"):
        if not PANDAS_INSTALLED:
            pytest.skip("pandas is not installed")
        else:
            y = pd_Series(y)
    # column vector: converted, but with a warning
    if isinstance(y, np.ndarray) and len(y.shape) == 2:
        with pytest.warns(UserWarning, match="column-vector"):
            lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name="list")
        return
    # nested list: rejected outright
    if isinstance(y, list) and isinstance(y[0], list):
        with pytest.raises(TypeError):
            lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name="list")
        return
    # non-numeric pandas Series: rejected
    if isinstance(y, pd_Series) and y.dtype == object:
        with pytest.raises(ValueError):
            lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name="list")
        return
    result = lgb.basic._list_to_1d_numpy(y, dtype=dtype, name="list")
    assert result.size == 10
    assert result.dtype == dtype
695
696


697
@pytest.mark.parametrize("init_score_type", ["array", "dataframe", "list"])
def test_init_score_for_multiclass_classification(init_score_type, rng):
    """A 2-D (row x class) init_score must round-trip through the Dataset
    whether given as a list, numpy array, or pandas DataFrame."""
    init_score = [[i * 10 + j for j in range(3)] for i in range(10)]
    if init_score_type == "array":
        init_score = np.array(init_score)
    elif init_score_type == "dataframe":
        if not PANDAS_INSTALLED:
            pytest.skip("Pandas is not installed.")
        init_score = pd_DataFrame(init_score)
    data = rng.uniform(size=(10, 2))
    ds = lgb.Dataset(data, init_score=init_score).construct()
    np.testing.assert_equal(ds.get_field("init_score"), init_score)
    np.testing.assert_equal(ds.init_score, init_score)
710
711
712


def test_smoke_custom_parser(tmp_path):
    """Constructing a Dataset with an unregistered custom parser must raise LightGBMError."""
    binary_train = Path(__file__).absolute().parents[2] / "examples" / "binary_classification" / "binary.train"
    config_path = tmp_path / "parser.ini"
    # Config names a parser class that was never registered.
    config_path.write_text('{"className": "dummy", "id": "1"}')

    dataset = lgb.Dataset(binary_train, params={"parser_config_file": config_path})
    expected_msg = "Cannot find parser class 'dummy', please register first or check config format"
    with pytest.raises(lgb.basic.LightGBMError, match=expected_msg):
        dataset.construct()
723
724
725
726
727
728


def test_param_aliases():
    """Sanity-check the structure and lookup behavior of the parameter alias table."""
    alias_map = lgb.basic._ConfigAliases.aliases
    assert isinstance(alias_map, dict)
    assert len(alias_map) > 100
    for param, alias_list in alias_map.items():
        assert isinstance(alias_list, list)
        assert len(alias_list) >= 1
        # every parameter appears among its own aliases
        assert param in alias_list
    assert lgb.basic._ConfigAliases.get("config", "task") == {"config", "config_file", "task", "task_type"}
    expected_order = [
        "min_data_in_leaf",
        "min_data",
        "min_samples_leaf",
        "min_child_samples",
        "min_data_per_leaf",
    ]
    assert lgb.basic._ConfigAliases.get_sorted("min_data_in_leaf") == expected_order
740
741
742


def _bad_gradients(preds, _):
743
744
745
746
    rng = np.random.default_rng()
    # "bad" = 1 element too many
    size = (len(preds) + 1,)
    return rng.standard_normal(size=size), rng.uniform(size=size)
747
748
749


def _good_gradients(preds, _):
750
751
    rng = np.random.default_rng()
    return rng.standard_normal(size=preds.shape), rng.uniform(size=preds.shape)
752
753


754
def test_custom_objective_safety(rng):
    """update() must reject custom gradients whose size doesn't match num_data * num_models."""
    n_samples = 100
    features = rng.standard_normal(size=(n_samples, 5))
    n_classes = 3
    labels_binary = np.arange(n_samples) % 2
    labels_multi = np.arange(n_samples) % n_classes

    ds_bin = lgb.Dataset(features, labels_binary).construct()
    ds_multi = lgb.Dataset(features, labels_multi).construct()

    booster_ok_bin = lgb.Booster({"objective": "none"}, ds_bin)
    booster_bad_bin = lgb.Booster({"objective": "none"}, ds_bin)
    booster_ok_multi = lgb.Booster({"objective": "none", "num_class": n_classes}, ds_multi)
    booster_bad_multi = lgb.Booster({"objective": "none", "num_class": n_classes}, ds_multi)

    # correctly-sized gradients are accepted ...
    booster_ok_bin.update(fobj=_good_gradients)
    # ... while oversized ones raise with a message naming the model count
    with pytest.raises(ValueError, match=re.escape("number of models per one iteration (1)")):
        booster_bad_bin.update(fobj=_bad_gradients)
    booster_ok_multi.update(fobj=_good_gradients)
    with pytest.raises(ValueError, match=re.escape(f"number of models per one iteration ({n_classes})")):
        booster_bad_multi.update(fobj=_bad_gradients)
773
774


775
776
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("feature_name", [["x1", "x2"], "auto"])
def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name, rng):
    """A DataFrame with a single float dtype should be handed over without copying."""
    pd = pytest.importorskip("pandas")
    arr = rng.uniform(size=(10, 2)).astype(dtype)
    # copy=False is necessary because starting with pandas 3.0, pd.DataFrame() creates
    # a copy of the input numpy array by default
    # ref: https://github.com/pandas-dev/pandas/issues/58913
    frame = pd.DataFrame(arr, copy=False)
    converted = lgb.basic._data_from_pandas(
        data=frame,
        feature_name=feature_name,
        categorical_feature="auto",
        pandas_categorical=None,
    )[0]
    assert converted.dtype == dtype
    # no-copy: the converted array aliases the original buffer
    assert np.shares_memory(arr, converted)
789
790


791
792
@pytest.mark.parametrize("feature_name", [["x1"], [42], "auto"])
@pytest.mark.parametrize("categories", ["seen", "unseen"])
def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories, rng):
    """Converting categoricals to codes must leave the caller's DataFrame untouched."""
    pd = pytest.importorskip("pandas")
    raw = rng.choice(a=["a", "b"], size=(100, 1))
    col = "a" if feature_name == "auto" else feature_name[0]
    frame = pd.DataFrame(raw.copy(), columns=[col], dtype="category")
    # 'seen' mimics both categories observed at train time; 'unseen' only 'a'.
    pandas_categorical = [["a", "b"]] if categories == "seen" else [["a"]]
    converted = lgb.basic._data_from_pandas(
        data=frame,
        feature_name=feature_name,
        categorical_feature="auto",
        pandas_categorical=pandas_categorical,
    )[0]
    # the original column still holds the raw string values
    np.testing.assert_equal(frame[col], raw[:, 0])
    if categories == "seen":
        # if all categories were seen during training we just take the codes
        expected_codes = frame[col].cat.codes
    else:
        # only 'a' was seen during training: its code is kept, anything else becomes NaN
        code_for_a = frame[col].cat.categories.get_loc("a")
        expected_codes = np.where(frame[col] == "a", code_for_a, np.nan)
    np.testing.assert_equal(expected_codes, converted[:, 0])
820
821


822
@pytest.mark.parametrize("min_data_in_bin", [2, 10])
def test_feature_num_bin(min_data_in_bin, rng):
    """feature_num_bin() should report bin counts by index and by name, and reject bad indices."""
    # NOTE: rng calls stay in this order so the stream matches the original test.
    columns = [
        rng.uniform(size=(100,)),  # continuous
        np.array([1, 2] * 50),  # two distinct values
        np.array([0, 1, 2] * 33 + [0]),  # three distinct values
        np.array([1, 2] * 49 + 2 * [np.nan]),  # two values plus missing
        np.zeros(100),  # constant column
        rng.choice(a=[0, 1], size=(100,)),  # categorical
    ]
    X = np.vstack(columns).T
    cat_index = X.shape[1] - 1
    feature_name = [f"x{i}" for i in range(cat_index)] + ["cat1"]
    shared_kwargs = {
        "params": {"min_data_in_bin": min_data_in_bin},
        "categorical_feature": [cat_index],  # last feature
    }
    ds = lgb.Dataset(X, feature_name=feature_name, **shared_kwargs).construct()
    expected_num_bins = [
        100 // min_data_in_bin + 1,  # extra bin for zero
        3,  # 0, 1, 2
        3,  # 0, 1, 2
        4,  # 0, 1, 2 + nan
        0,  # unused
        3,  # 0, 1 + nan
    ]
    # lookup by positional index
    assert [ds.feature_num_bin(i) for i in range(X.shape[1])] == expected_num_bins
    # lookup by the names we supplied
    assert [ds.feature_num_bin(name) for name in feature_name] == expected_num_bins
    # lookup by the auto-generated default names
    unnamed_ds = lgb.Dataset(X, **shared_kwargs).construct()
    default_bins = [unnamed_ds.feature_num_bin(f"Column_{i}") for i in range(X.shape[1])]
    assert default_bins == expected_num_bins
    # an out-of-range index should raise with an informative message
    num_features = X.shape[1]
    expected_msg = (
        f"Tried to retrieve number of bins for feature index {num_features}, "
        f"but the valid feature indices are \\[0, {num_features - 1}\\]."
    )
    with pytest.raises(lgb.basic.LightGBMError, match=expected_msg):
        ds.feature_num_bin(num_features)
869
870


871
872
873
def test_feature_num_bin_with_max_bin_by_feature(rng):
    """Per-feature max_bin settings should be reflected by feature_num_bin()."""
    data = rng.uniform(size=(100, 3))
    per_feature_bins = rng.integers(low=3, high=30, size=data.shape[1])
    ds = lgb.Dataset(data, params={"max_bin_by_feature": per_feature_bins}).construct()
    observed = [ds.feature_num_bin(i) for i in range(data.shape[1])]
    np.testing.assert_equal(observed, per_feature_bins)
877
878
879
880
881


def test_set_leaf_output():
    """Shifting every leaf output by +1 should shift all predictions by exactly +1."""
    X, y = load_breast_cancer(return_X_y=True)
    booster = lgb.Booster({"num_leaves": 2}, lgb.Dataset(X, y))
    booster.update()
    baseline = booster.predict(X)
    for leaf in (0, 1):
        current = booster.get_leaf_output(tree_id=0, leaf_id=leaf)
        booster.set_leaf_output(tree_id=0, leaf_id=leaf, value=current + 1)
    np.testing.assert_allclose(booster.predict(X), baseline + 1)
889
890


891
def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Dataset(rng):
    """Default feature names follow the 'Column_{i}' pattern."""
    dataset = lgb.Dataset(data=rng.standard_normal(size=(100, 3)))
    expected_names = [f"Column_{i}" for i in range(3)]
    assert dataset.construct().feature_name == expected_names
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949


# NOTE: this intentionally contains values where num_leaves <, ==, and > (max_depth^2)
@pytest.mark.parametrize(("max_depth", "num_leaves"), [(-1, 3), (-1, 50), (5, 3), (5, 31), (5, 32), (8, 3), (8, 31)])
def test_max_depth_warning_is_not_raised_if_num_leaves_is_also_provided(capsys, num_leaves, max_depth):
    """Explicitly passing num_leaves should silence the depth-constraint warning."""
    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
    booster_params = {
        "objective": "binary",
        "max_depth": max_depth,
        "num_leaves": num_leaves,
        "num_iterations": 1,
        "verbose": 0,
    }
    lgb.Booster(params=booster_params, train_set=lgb.Dataset(X, label=y))
    captured = capsys.readouterr().out
    assert "Provided parameters constrain tree depth" not in captured


# NOTE: max_depth < 5 is significant here because the default for num_leaves=31. With max_depth=5,
#       a full depth-wise tree would have 2^5 = 32 leaves.
@pytest.mark.parametrize("max_depth", [1, 2, 3, 4])
def test_max_depth_warning_is_not_raised_if_max_depth_gt_1_and_lt_5_and_num_leaves_omitted(capsys, max_depth):
    """Shallow depths (< 5) cannot under-constrain the default num_leaves, so no warning."""
    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
    booster_params = {
        "objective": "binary",
        "max_depth": max_depth,
        "num_iterations": 1,
        "verbose": 0,
    }
    lgb.Booster(params=booster_params, train_set=lgb.Dataset(X, label=y))
    captured = capsys.readouterr().out
    assert "Provided parameters constrain tree depth" not in captured


@pytest.mark.parametrize("max_depth", [5, 6, 7, 8, 9])
def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(capsys, max_depth):
    """Depths >= 5 with the default num_leaves should trigger the constraint warning."""
    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
    booster_params = {
        "objective": "binary",
        "max_depth": max_depth,
        "num_iterations": 1,
        "verbose": 0,
    }
    lgb.Booster(params=booster_params, train_set=lgb.Dataset(X, label=y))
    expected_warning = (
        f"[LightGBM] [Warning] Provided parameters constrain tree depth (max_depth={max_depth}) without explicitly "
        f"setting 'num_leaves'. This can lead to underfitting. To resolve this warning, pass 'num_leaves' (<={2**max_depth}) "
        "in params. Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity."
    )
    assert expected_warning in capsys.readouterr().out
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985


@pytest.mark.parametrize("order", ["C", "F"])
@pytest.mark.parametrize("dtype", ["float32", "int64"])
def test_no_copy_in_dataset_from_numpy_2d(rng, order, dtype):
    """Flattening a 2d array should copy only when a dtype conversion is required."""
    mat = np.require(rng.random(size=(100, 3)), dtype=dtype, requirements=order)
    flat, layout = lgb.basic._np2d_to_np1d(mat)
    expected_layout = lgb.basic._C_API_IS_COL_MAJOR if order == "F" else lgb.basic._C_API_IS_ROW_MAJOR
    assert layout == expected_layout
    if dtype == "float32":
        # float32 is consumed directly, so the flat view aliases the input
        assert np.shares_memory(mat, flat)
    else:
        # int64 must be converted, which forces a copy
        assert not np.shares_memory(mat, flat)


def test_equal_datasets_from_row_major_and_col_major_data(tmp_path):
    """Row-major and column-major inputs must produce byte-identical Datasets."""
    # row-major (C-contiguous) input
    X_c, y = make_blobs(n_samples=1_000, n_features=3, centers=2)
    assert X_c.flags["C_CONTIGUOUS"] and not X_c.flags["F_CONTIGUOUS"]
    path_row = tmp_path / "ds_row.txt"
    lgb.Dataset(X_c, y)._dump_text(path_row)

    # the same data, column-major (F-contiguous)
    X_f = np.asfortranarray(X_c)
    assert X_f.flags["F_CONTIGUOUS"] and not X_f.flags["C_CONTIGUOUS"]
    path_col = tmp_path / "ds_col.txt"
    lgb.Dataset(X_f, y)._dump_text(path_col)

    # the text dumps must be identical
    assert filecmp.cmp(path_row, path_col)
986
987
988
989
990
991
992
993
994
995
996
997
998
999


def test_equal_datasets_from_one_and_several_matrices_w_different_layouts(rng, tmp_path):
    """A list of mixed-layout matrices must dump identically to their stacked equivalent."""
    parts = [np.require(rng.random(size=(100, 2)), requirements=layout) for layout in ("C", "F", "F", "C")]
    several_path = tmp_path / "several.txt"
    lgb.Dataset(parts)._dump_text(several_path)

    stacked = np.vstack(parts)
    one_path = tmp_path / "one.txt"
    lgb.Dataset(stacked)._dump_text(one_path)

    assert filecmp.cmp(one_path, several_path)