# coding: utf-8
import filecmp
import numbers
import re
from copy import deepcopy
from os import getenv
from pathlib import Path

import numpy as np
import pytest
from scipy import sparse
from sklearn.datasets import dump_svmlight_file, load_svmlight_file
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame, pd_Series

from .utils import dummy_obj, load_breast_cancer, mse_obj, np_assert_array_equal

21
def test_basic(tmp_path):
    """End-to-end smoke test of Booster training, saving, reloading and prediction.

    Trains a binary classifier for 20 iterations, then checks that predictions
    agree between in-memory data, an svmlight file and a model reloaded from
    disk, and that predicting on data with the wrong feature count is rejected.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
    )
    feature_names = [f"Column_{i}" for i in range(X_train.shape[1])]
    feature_names[1] = "a" * 1000  # set one name to a value longer than default buffer size
    train_data = lgb.Dataset(X_train, label=y_train, feature_name=feature_names)
    valid_data = train_data.create_valid(X_test, label=y_test)

    params = {
        "objective": "binary",
        "metric": "auc",
        "min_data": 10,
        "num_leaves": 15,
        "verbose": -1,
        "num_threads": 1,
        "max_bin": 255,
        "gpu_use_dp": True,
    }
    bst = lgb.Booster(params, train_data)
    bst.add_valid(valid_data, "valid_1")

    for i in range(20):
        bst.update()
        if i % 10 == 0:
            print(bst.eval_train(), bst.eval_valid())

    assert train_data.get_feature_name() == feature_names

    assert bst.current_iteration() == 20
    assert bst.num_trees() == 20
    assert bst.num_model_per_iteration() == 1
    # the exact bounds are only asserted when not running the CUDA task
    if getenv("TASK", "") != "cuda":
        assert bst.lower_bound() == pytest.approx(-2.9040190126976606)
        assert bst.upper_bound() == pytest.approx(3.3182142872462883)

    tname = tmp_path / "svm_light.dat"
    model_file = tmp_path / "model.txt"

    bst.save_model(model_file)
    pred_from_matr = bst.predict(X_test)
    with open(tname, "w+b") as f:
        dump_svmlight_file(X_test, y_test, f)
    pred_from_file = bst.predict(tname)
    np.testing.assert_allclose(pred_from_matr, pred_from_file)

    # check saved model persistence
    bst = lgb.Booster(params, model_file=model_file)
    assert bst.feature_name() == feature_names
    pred_from_model_file = bst.predict(X_test)
    # we need to check the consistency of model file here, so test for exact equal
    np.testing.assert_array_equal(pred_from_matr, pred_from_model_file)

    # check early stopping is working. Make it stop very early, so the scores should be very close to zero
    pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5}
    pred_early_stopping = bst.predict(X_test, **pred_parameter)
    # scores likely to be different, but prediction should still be the same
    np.testing.assert_array_equal(np.sign(pred_from_matr), np.sign(pred_early_stopping))

    # test that shape is checked during prediction
    bad_X_test = X_test[:, 1:]
    bad_shape_error_msg = "The number of features in data*"
    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, bad_X_test)
    np.testing.assert_raises_regex(
        lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, sparse.csr_matrix(bad_X_test)
    )
    np.testing.assert_raises_regex(
        lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, sparse.csc_matrix(bad_X_test)
    )
    with open(tname, "w+b") as f:
        dump_svmlight_file(bad_X_test, y_test, f)
    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, tname)
    # same data, but written with one-based feature indices
    with open(tname, "w+b") as f:
        dump_svmlight_file(X_test, y_test, f, zero_based=False)
    np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg, bst.predict, tname)


98
99
100
101
102
103
104
105
106
107
108
109
110
class NumpySequence(lgb.Sequence):
    """``lgb.Sequence`` implementation backed by an in-memory array.

    Used by the tests in this module to build Datasets from Sequence objects.
    """

    def __init__(self, ndarray, batch_size):
        """Store the wrapped array and the ``batch_size`` attribute."""
        self.ndarray = ndarray
        self.batch_size = batch_size

    def __getitem__(self, idx):
        """Return the row(s) selected by an integer, a step-1 slice or a list of indices.

        Raises
        ------
        NotImplementedError
            If a slice with a step other than ``None`` or 1 is given.
        TypeError
            If ``idx`` is not an integer, slice or list.
        """
        # The simple implementation is just a single "return self.ndarray[idx]"
        # The following is for demo and testing purpose.
        if isinstance(idx, numbers.Integral):
            return self.ndarray[idx]
        elif isinstance(idx, slice):
            if not (idx.step is None or idx.step == 1):
                raise NotImplementedError("No need to implement, caller will not set step by now")
            return self.ndarray[idx.start : idx.stop]
        elif isinstance(idx, list):
            return self.ndarray[idx]
        else:
            raise TypeError(f"Sequence Index must be an integer/list/slice, got {type(idx).__name__}")

    def __len__(self):
        """Return the length of the wrapped array."""
        return len(self.ndarray)


def _create_sequence_from_ndarray(data, num_seq, batch_size):
    """Wrap ``data`` in ``NumpySequence`` objects.

    Parameters
    ----------
    data : array with a ``shape`` attribute, sliced along its first axis
    num_seq : int
        Number of sequences to split ``data`` into.
    batch_size : int or None
        Passed through to every ``NumpySequence``.

    Returns
    -------
    A single ``NumpySequence`` when ``num_seq == 1``; otherwise a list of
    ``NumpySequence`` objects covering consecutive row ranges of ``data``.
    The list holds at most ``num_seq`` entries; empty ranges are skipped.
    """
    if num_seq == 1:
        return NumpySequence(data, batch_size)

    nrow = data.shape[0]
    # Integer boundaries spread the remainder over the chunks. Stepping by
    # ``nrow // num_seq`` instead yields an extra short chunk whenever nrow is
    # not divisible by num_seq, and a zero step when num_seq > nrow.
    bounds = [nrow * i // num_seq for i in range(num_seq + 1)]
    return [
        NumpySequence(data[start:end], batch_size)
        for start, end in zip(bounds[:-1], bounds[1:])
        if end > start
    ]


135
136
137
138
@pytest.mark.parametrize("sample_count", [11, 100, None])
@pytest.mark.parametrize("batch_size", [3, None])
@pytest.mark.parametrize("include_0_and_nan", [False, True])
@pytest.mark.parametrize("num_seq", [1, 3])
def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
    """Datasets built from Sequence objects must save to the same binary file as ones built from numpy arrays."""
    params = {"bin_construct_sample_cnt": sample_count}

    nrow = 50
    half_nrow = nrow // 2
    ncol = 11
    data = np.arange(nrow * ncol, dtype=np.float64).reshape((nrow, ncol))

    if include_0_and_nan:
        # whole col
        data[:, 0] = 0
        data[:, 1] = np.nan

        # half col
        data[:half_nrow, 3] = 0
        data[:half_nrow, 2] = np.nan

        data[half_nrow:-2, 4] = 0
        data[:half_nrow, 4] = np.nan

    X = data[:, :-1]
    Y = data[:, -1]

    npy_bin_fname = tmpdir / "data_from_npy.bin"
    seq_bin_fname = tmpdir / "data_from_seq.bin"

    # Create dataset from numpy array directly.
    ds = lgb.Dataset(X, label=Y, params=params)
    ds.save_binary(npy_bin_fname)

    # Create dataset using Sequence.
    seqs = _create_sequence_from_ndarray(X, num_seq, batch_size)
    seq_ds = lgb.Dataset(seqs, label=Y, params=params)
    seq_ds.save_binary(seq_bin_fname)

    assert filecmp.cmp(npy_bin_fname, seq_bin_fname)

    # Test for validation set.
    # Select some random rows as valid data.
    rng = np.random.default_rng()  # Pass integer to set seed when needed.
    valid_idx = (rng.random(10) * nrow).astype(np.int32)
    valid_data = data[valid_idx, :]
    valid_X = valid_data[:, :-1]
    valid_Y = valid_data[:, -1]

    valid_npy_bin_fname = tmpdir / "valid_data_from_npy.bin"
    valid_seq_bin_fname = tmpdir / "valid_data_from_seq.bin"
    valid_seq2_bin_fname = tmpdir / "valid_data_from_seq2.bin"

    valid_ds = lgb.Dataset(valid_X, label=valid_Y, params=params, reference=ds)
    valid_ds.save_binary(valid_npy_bin_fname)

    # From Dataset constructor, with dataset from numpy array.
    valid_seqs = _create_sequence_from_ndarray(valid_X, num_seq, batch_size)
    valid_seq_ds = lgb.Dataset(valid_seqs, label=valid_Y, params=params, reference=ds)
    valid_seq_ds.save_binary(valid_seq_bin_fname)
    assert filecmp.cmp(valid_npy_bin_fname, valid_seq_bin_fname)

    # From Dataset.create_valid, with dataset from sequence.
    valid_seq_ds2 = seq_ds.create_valid(valid_seqs, label=valid_Y, params=params)
    valid_seq_ds2.save_binary(valid_seq2_bin_fname)
    assert filecmp.cmp(valid_npy_bin_fname, valid_seq2_bin_fname)


203
@pytest.mark.parametrize("num_seq", [1, 2])
def test_sequence_get_data(num_seq):
    """``get_data()`` returns the original sequences, and the matching rows for a subset."""
    nrow = 20
    ncol = 11
    data = np.arange(nrow * ncol, dtype=np.float64).reshape((nrow, ncol))
    X = data[:, :-1]
    Y = data[:, -1]

    seqs = _create_sequence_from_ndarray(data=X, num_seq=num_seq, batch_size=6)
    seq_ds = lgb.Dataset(seqs, label=Y, params=None, free_raw_data=False).construct()
    assert seq_ds.get_data() == seqs

    used_indices = np.random.choice(np.arange(nrow), nrow // 3, replace=False)
    subset_data = seq_ds.subset(used_indices).construct()
    # the subset's raw data is compared against the rows in ascending index order
    np.testing.assert_array_equal(subset_data.get_data(), X[sorted(used_indices)])


220
def test_chunked_dataset():
    """Train and validation Datasets can be constructed from lists of row chunks."""
    X_train, X_test, y_train, y_test = train_test_split(
        *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
    )

    chunk_size = X_train.shape[0] // 10 + 1
    X_train = [X_train[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
    X_test = [X_test[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]

    train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100})
    valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100})
    train_data.construct()
    valid_data.construct()


def test_chunked_dataset_linear():
    """Chunked Datasets can also be constructed when ``linear_tree`` is enabled."""
    X_train, X_test, y_train, y_test = train_test_split(
        *load_breast_cancer(return_X_y=True), test_size=0.1, random_state=2
    )
    chunk_size = X_train.shape[0] // 10 + 1
    X_train = [X_train[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
    X_test = [X_test[i * chunk_size : (i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]
    params = {"bin_construct_sample_cnt": 100, "linear_tree": True}
    train_data = lgb.Dataset(X_train, label=y_train, params=params)
    valid_data = train_data.create_valid(X_test, label=y_test, params=params)
    train_data.construct()
    valid_data.construct()


249
250
def test_save_dataset_subset_and_load_from_file(tmp_path):
    """A Dataset subset saved to a binary file can be loaded and constructed again."""
    data = np.random.rand(100, 2)
    params = {"max_bin": 50, "min_data_in_bin": 10}
    ds = lgb.Dataset(data, params=params)
    subset_file = tmp_path / "subset.bin"
    ds.subset([1, 2, 3, 5, 8]).save_binary(subset_file)
    lgb.Dataset(subset_file, params=params).construct()


257
def test_subset_group():
    """Subsetting a ranking Dataset recomputes the group sizes for the selected rows."""
    rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank"
    X_train, y_train = load_svmlight_file(str(rank_example_dir / "rank.train"))
    q_train = np.loadtxt(str(rank_example_dir / "rank.train.query"))
    lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
    assert len(lgb_train.get_group()) == 201
    subset = lgb_train.subset(list(range(10))).construct()
    subset_group = subset.get_group()
    assert len(subset_group) == 2
    assert subset_group[0] == 1
    assert subset_group[1] == 9


def test_add_features_throws_if_num_data_unequal():
    """Adding features from a Dataset with a different number of rows must fail."""
    larger = lgb.Dataset(np.random.random((100, 1))).construct()
    smaller = lgb.Dataset(np.random.random((10, 1))).construct()
    with pytest.raises(lgb.basic.LightGBMError):
        larger.add_features_from(smaller)


def test_add_features_throws_if_datasets_unconstructed():
    """``add_features_from`` raises ``ValueError`` unless both Datasets are constructed."""
    X1 = np.random.random((100, 1))
    X2 = np.random.random((100, 1))
    # (construct target, construct source): every combination except "both constructed"
    for construct_target, construct_source in [(False, False), (True, False), (False, True)]:
        d1 = lgb.Dataset(X1)
        d2 = lgb.Dataset(X2)
        if construct_target:
            d1.construct()
        if construct_source:
            d2.construct()
        # only the call under test sits inside the context manager, so a
        # ValueError raised during setup cannot make the test pass
        with pytest.raises(ValueError):
            d1.add_features_from(d2)


296
297
298
def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
    """A Dataset merged from two column blocks dumps the same text as one built from all columns.

    Columns 1 and 3 are all zeros, so each split point mixes those columns
    with the random ones differently.
    """
    X = np.random.random((100, 5))
    X[:, [1, 3]] = 0
    names = [f"col_{i}" for i in range(5)]
    for j in range(1, 5):
        d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
        d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
        d1.add_features_from(d2)
        d1name = tmp_path / "d1.txt"
        d1._dump_text(d1name)
        d = lgb.Dataset(X, feature_name=names).construct()
        dname = tmp_path / "d.txt"
        d._dump_text(dname)
        d1txt = d1name.read_text()
        dtxt = dname.read_text()
        assert dtxt == d1txt

315
316
317
318

def test_add_features_same_booster_behaviour(tmp_path):
    """Boosters trained on a merged Dataset and on the full Dataset save identical models."""
    X = np.random.random((100, 5))
    X[:, [1, 3]] = 0
    names = [f"col_{i}" for i in range(5)]
    for j in range(1, 5):
        d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
        d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
        d1.add_features_from(d2)
        d = lgb.Dataset(X, feature_name=names).construct()
        y = np.random.random(100)
        d1.set_label(y)
        d.set_label(y)
        b1 = lgb.Booster(train_set=d1)
        b = lgb.Booster(train_set=d)
        for _ in range(10):
            b.update()
            b1.update()
        dname = tmp_path / "d.txt"
        d1name = tmp_path / "d1.txt"
        b1.save_model(d1name)
        b.save_model(dname)
        dtxt = dname.read_text()
        d1txt = d1name.read_text()
        assert dtxt == d1txt


def test_add_features_from_different_sources():
    """``add_features_from`` works across raw-data types and keeps or drops raw data as expected."""
    pd = pytest.importorskip("pandas")
    n_row = 100
    n_col = 5
    X = np.random.random((n_row, n_col))
    xxs = [X, sparse.csr_matrix(X), pd.DataFrame(X)]
    names = [f"col_{i}" for i in range(n_col)]
    seq = _create_sequence_from_ndarray(X, 1, 30)
    seq_ds = lgb.Dataset(seq, feature_name=names, free_raw_data=False).construct()
    npy_list_ds = lgb.Dataset(
        [X[: n_row // 2, :], X[n_row // 2 :, :]], feature_name=names, free_raw_data=False
    ).construct()
    immergeable_dds = [seq_ds, npy_list_ds]
    for x_1 in xxs:
        # test that method works even with free_raw_data=True
        d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=True).construct()
        d2 = lgb.Dataset(x_1, feature_name=names, free_raw_data=True).construct()
        d1.add_features_from(d2)
        assert d1.data is None

        # test that method works but sets raw data to None in case of immergeable data types
        d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=False).construct()
        for d2 in immergeable_dds:
            d1.add_features_from(d2)
            assert d1.data is None

        # test that method works for different data types
        d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=False).construct()
        res_feature_names = deepcopy(names)
        # idx starts at 2: after the first merge d1 holds two blocks of n_col features
        for idx, x_2 in enumerate(xxs, 2):
            original_type = type(d1.get_data())
            d2 = lgb.Dataset(x_2, feature_name=names, free_raw_data=False).construct()
            d1.add_features_from(d2)
            assert isinstance(d1.get_data(), original_type)
            assert d1.get_data().shape == (n_row, n_col * idx)
            res_feature_names += [f"D{idx}_{name}" for name in names]
            assert d1.feature_name == res_feature_names


383
384
385
386
387
388
def test_add_features_does_not_fail_if_initial_dataset_has_zero_informative_features(capsys):
    """Features can be added to a Dataset whose only column is constant.

    The constant-column Dataset is expected to log the "no meaningful
    features" warning on construction, and to keep the same underlying handle
    after the merge.
    """
    arr_a = np.zeros((100, 1), dtype=np.float32)
    arr_b = np.random.normal(size=(100, 5))

    dataset_a = lgb.Dataset(arr_a).construct()
    expected_msg = (
        "[LightGBM] [Warning] There are no meaningful features which satisfy "
        "the provided configuration. Decreasing Dataset parameters min_data_in_bin "
        "or min_data_in_leaf and re-constructing Dataset might resolve this warning.\n"
    )
    log_lines = capsys.readouterr().out
    assert expected_msg in log_lines

    dataset_b = lgb.Dataset(arr_b).construct()

    original_handle = dataset_a._handle.value
    dataset_a.add_features_from(dataset_b)
    assert dataset_a.num_feature() == 6
    assert dataset_a.num_data() == 100
    assert dataset_a._handle.value == original_handle


405
406
407
408
def test_cegb_affects_behavior(tmp_path):
    """Each CEGB penalty setting must produce a saved model that differs from the unpenalised baseline."""
    X = np.random.random((100, 5))
    X[:, [1, 3]] = 0
    y = np.random.random(100)
    names = [f"col_{i}" for i in range(5)]
    ds = lgb.Dataset(X, feature_name=names).construct()
    ds.set_label(y)
    base = lgb.Booster(train_set=ds)
    for _ in range(10):
        base.update()
    basename = tmp_path / "basename.txt"
    base.save_model(basename)
    basetxt = basename.read_text()
    # Set extremely harsh penalties, so CEGB will block most splits.
    cases = [
        {"cegb_penalty_feature_coupled": [50, 100, 10, 25, 30]},
        {"cegb_penalty_feature_lazy": [1, 2, 3, 4, 5]},
        {"cegb_penalty_split": 1},
    ]
    for case in cases:
        booster = lgb.Booster(train_set=ds, params=case)
        for _ in range(10):
            booster.update()
        casename = tmp_path / "casename.txt"
        booster.save_model(casename)
        casetxt = casename.read_text()
        assert basetxt != casetxt


def test_cegb_scaling_equalities(tmp_path):
    """Scaling a CEGB penalty by a factor and ``cegb_tradeoff`` by its inverse must give the same model."""
    X = np.random.random((100, 5))
    X[:, [1, 3]] = 0
    y = np.random.random(100)
    names = [f"col_{i}" for i in range(5)]
    ds = lgb.Dataset(X, feature_name=names).construct()
    ds.set_label(y)
    # Compare pairs of penalties, to ensure scaling works as intended
    pairs = [
        (
            {"cegb_penalty_feature_coupled": [1, 2, 1, 2, 1]},
            {"cegb_penalty_feature_coupled": [0.5, 1, 0.5, 1, 0.5], "cegb_tradeoff": 2},
        ),
        (
            {"cegb_penalty_feature_lazy": [0.01, 0.02, 0.03, 0.04, 0.05]},
            {"cegb_penalty_feature_lazy": [0.005, 0.01, 0.015, 0.02, 0.025], "cegb_tradeoff": 2},
        ),
        ({"cegb_penalty_split": 1}, {"cegb_penalty_split": 2, "cegb_tradeoff": 0.5}),
    ]
    for p1, p2 in pairs:
        booster1 = lgb.Booster(train_set=ds, params=p1)
        booster2 = lgb.Booster(train_set=ds, params=p2)
        for _ in range(10):
            booster1.update()
            booster2.update()
        p1name = tmp_path / "p1.txt"
        # Reset booster1's parameters to p2, so the parameter section of the file matches.
        booster1.reset_parameter(p2)
        booster1.save_model(p1name)
        p1txt = p1name.read_text()
        p2name = tmp_path / "p2.txt"
        booster2.save_model(p2name)
        p2txt = p2name.read_text()
        assert p1txt == p2txt


def test_consistent_state_for_dataset_fields():
    """Dataset attributes, ``get_*()`` accessors and ``get_field()`` must agree with each other.

    The metadata is fed an array containing NaN and inf; the checks require
    that the stored values at those positions are neither NaN nor inf.
    """

    def check_asserts(data):
        np.testing.assert_allclose(data.label, data.get_label())
        np.testing.assert_allclose(data.label, data.get_field("label"))
        assert not np.isnan(data.label[0])
        assert not np.isinf(data.label[1])
        np.testing.assert_allclose(data.weight, data.get_weight())
        np.testing.assert_allclose(data.weight, data.get_field("weight"))
        assert not np.isnan(data.weight[0])
        assert not np.isinf(data.weight[1])
        np.testing.assert_allclose(data.init_score, data.get_init_score())
        np.testing.assert_allclose(data.init_score, data.get_field("init_score"))
        assert not np.isnan(data.init_score[0])
        assert not np.isinf(data.init_score[1])
        # all three fields received the same input, so they must store the same values
        assert np.all(np.isclose([data.label[0], data.weight[0], data.init_score[0]], data.label[0]))
        assert data.label[1] == pytest.approx(data.weight[1])
        assert data.feature_name == data.get_feature_name()

    X, y = load_breast_cancer(return_X_y=True)
    sequence = np.ones(y.shape[0])
    sequence[0] = np.nan
    sequence[1] = np.inf
    feature_names = [f"f{i}" for i in range(X.shape[1])]
    # fields passed to the constructor
    lgb_data = lgb.Dataset(X, sequence, weight=sequence, init_score=sequence, feature_name=feature_names).construct()
    check_asserts(lgb_data)
    # fields set through the setters after construction
    lgb_data = lgb.Dataset(X, y).construct()
    lgb_data.set_label(sequence)
    lgb_data.set_weight(sequence)
    lgb_data.set_init_score(sequence)
    lgb_data.set_feature_name(feature_names)
    check_asserts(lgb_data)


507
508
509
510
def test_dataset_construction_overwrites_user_provided_metadata_fields():
    """Metadata is returned as provided before construction and as numpy arrays afterwards.

    ``position`` is only supplied and checked when the ``TASK`` environment
    variable is not ``"cuda"``.
    """
    X = np.array([[1.0, 2.0], [3.0, 4.0]])

    position = np.array([0.0, 1.0], dtype=np.float32)
    if getenv("TASK", "") == "cuda":
        position = None

    dtrain = lgb.Dataset(
        X,
        params={"min_data_in_bin": 1, "min_data_in_leaf": 1, "verbosity": -1},
        group=[1, 1],
        init_score=[0.312, 0.708],
        label=[1, 2],
        position=position,
        weight=[0.5, 1.5],
    )

    # unconstructed, get_* methods should return whatever was provided
    assert dtrain.group == [1, 1]
    assert dtrain.get_group() == [1, 1]
    assert dtrain.init_score == [0.312, 0.708]
    assert dtrain.get_init_score() == [0.312, 0.708]
    assert dtrain.label == [1, 2]
    assert dtrain.get_label() == [1, 2]
    if getenv("TASK", "") != "cuda":
        np_assert_array_equal(dtrain.position, np.array([0.0, 1.0], dtype=np.float32), strict=True)
        np_assert_array_equal(dtrain.get_position(), np.array([0.0, 1.0], dtype=np.float32), strict=True)
    assert dtrain.weight == [0.5, 1.5]
    assert dtrain.get_weight() == [0.5, 1.5]

    # before construction, get_field() should raise an exception
    for field_name in ["group", "init_score", "label", "position", "weight"]:
        with pytest.raises(Exception, match=f"Cannot get {field_name} before construct Dataset"):
            dtrain.get_field(field_name)

    # constructed, get_* methods should return numpy arrays, even when the provided
    # input was a list of floats or ints
    dtrain.construct()
    expected_group = np.array([1, 1], dtype=np.int32)
    np_assert_array_equal(dtrain.group, expected_group, strict=True)
    np_assert_array_equal(dtrain.get_group(), expected_group, strict=True)
    # get_field("group") returns a numpy array with boundaries, instead of size
    np_assert_array_equal(dtrain.get_field("group"), np.array([0, 1, 2], dtype=np.int32), strict=True)

    expected_init_score = np.array(
        [0.312, 0.708],
    )
    np_assert_array_equal(dtrain.init_score, expected_init_score, strict=True)
    np_assert_array_equal(dtrain.get_init_score(), expected_init_score, strict=True)
    np_assert_array_equal(dtrain.get_field("init_score"), expected_init_score, strict=True)

    expected_label = np.array([1, 2], dtype=np.float32)
    np_assert_array_equal(dtrain.label, expected_label, strict=True)
    np_assert_array_equal(dtrain.get_label(), expected_label, strict=True)
    np_assert_array_equal(dtrain.get_field("label"), expected_label, strict=True)

    if getenv("TASK", "") != "cuda":
        expected_position = np.array([0.0, 1.0], dtype=np.float32)
        np_assert_array_equal(dtrain.position, expected_position, strict=True)
        np_assert_array_equal(dtrain.get_position(), expected_position, strict=True)
        # NOTE: "position" is converted to int32 on the C++ side
        np_assert_array_equal(dtrain.get_field("position"), np.array([0.0, 1.0], dtype=np.int32), strict=True)

    expected_weight = np.array([0.5, 1.5], dtype=np.float32)
    np_assert_array_equal(dtrain.weight, expected_weight, strict=True)
    np_assert_array_equal(dtrain.get_weight(), expected_weight, strict=True)
    np_assert_array_equal(dtrain.get_field("weight"), expected_weight, strict=True)


576
577
578
579
580
581
582
583
584
585
def test_dataset_construction_with_high_cardinality_categorical_succeeds():
    """A categorical column drawn from 5,000 possible values must not prevent Dataset construction."""
    pandas = pytest.importorskip("pandas")
    n_rows = 10_000
    features = pandas.DataFrame({"x1": np.random.randint(0, 5_000, n_rows)})
    target = np.random.rand(n_rows)
    dataset = lgb.Dataset(features, target, categorical_feature=["x1"])
    dataset.construct()
    assert dataset.num_data() == n_rows
    assert dataset.num_feature() == 1


586
587
588
589
590
def test_choose_param_value():
    """``_choose_param_value`` resolves aliases, applies defaults and leaves its input untouched."""
    original_params = {
        "local_listen_port": 1234,
        "port": 2222,
        "metric": "auc",
        "num_trees": 81,
        "n_iter": 13,
    }
    # snapshot taken before any call, used at the end to prove the input was not mutated
    expected_params = deepcopy(original_params)

    # should resolve duplicate aliases, and prefer the main parameter
    params = lgb.basic._choose_param_value(
        main_param_name="local_listen_port", params=original_params, default_value=5555
    )
    assert params["local_listen_port"] == 1234
    assert "port" not in params

    # should choose the highest priority alias and set that value on main param
    # if only aliases are used
    params = lgb.basic._choose_param_value(main_param_name="num_iterations", params=params, default_value=17)
    assert params["num_iterations"] == 13
    assert "num_trees" not in params
    assert "n_iter" not in params

    # should use the default if main param and aliases are missing
    params = lgb.basic._choose_param_value(main_param_name="learning_rate", params=params, default_value=0.789)
    assert params["learning_rate"] == 0.789

    # all changes should be made on copies and not modify the original
    assert original_params == expected_params


624
625
626
627
def test_choose_param_value_preserves_nones():
    """``None`` values are kept as-is when choosing between a main parameter, its aliases and the default."""
    # preserves None found for main param and still removes aliases
    params = lgb.basic._choose_param_value(
        main_param_name="num_threads",
        params={"num_threads": None, "n_jobs": 4, "objective": "regression"},
        default_value=2,
    )
    assert params == {"num_threads": None, "objective": "regression"}

    # correctly chooses value when only an alias is provided
    params = lgb.basic._choose_param_value(
        main_param_name="num_threads", params={"n_jobs": None, "objective": "regression"}, default_value=2
    )
    assert params == {"num_threads": None, "objective": "regression"}

    # adds None if that's given as the default and param not found
    params = lgb.basic._choose_param_value(
        main_param_name="min_data_in_leaf", params={"objective": "regression"}, default_value=None
    )
    assert params == {"objective": "regression", "min_data_in_leaf": None}


646
647
648
649
@pytest.mark.parametrize("objective_alias", lgb.basic._ConfigAliases.get("objective"))
def test_choose_param_value_objective(objective_alias):
    """Callable objectives are resolved through every alias of ``objective``."""
    # If callable is found in objective
    params = {objective_alias: dummy_obj}
    params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=None)
    assert params["objective"] == dummy_obj

    # Value in params should be preferred to the default_value passed from keyword arguments
    params = {objective_alias: dummy_obj}
    params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=mse_obj)
    assert params["objective"] == dummy_obj

    # None of objective or its aliases in params, but default_value is callable.
    params = {}
    params = lgb.basic._choose_param_value(main_param_name="objective", params=params, default_value=mse_obj)
    assert params["objective"] == mse_obj


664
665
@pytest.mark.parametrize("collection", ["1d_np", "2d_np", "pd_float", "pd_str", "1d_list", "2d_list"])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_list_to_1d_numpy(collection, dtype):
    """``_list_to_1d_numpy`` converts 1-D inputs and warns or raises for the unsupported ones."""
    collection2y = {
        "1d_np": np.random.rand(10),
        "2d_np": np.random.rand(10, 1),
        "pd_float": np.random.rand(10),
        "pd_str": ["a", "b"],
        "1d_list": [1] * 10,
        "2d_list": [[1], [2]],
    }
    y = collection2y[collection]
    if collection.startswith("pd"):
        if not PANDAS_INSTALLED:
            pytest.skip("pandas is not installed")
        else:
            y = pd_Series(y)
    if isinstance(y, np.ndarray) and len(y.shape) == 2:
        # 2-D numpy input is accepted, with a warning
        with pytest.warns(UserWarning, match="column-vector"):
            lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name="list")
        return
    elif isinstance(y, list) and isinstance(y[0], list):
        with pytest.raises(TypeError):
            lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name="list")
        return
    elif isinstance(y, pd_Series) and y.dtype == object:
        with pytest.raises(ValueError):
            lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name="list")
        return
    result = lgb.basic._list_to_1d_numpy(y, dtype=dtype, name="list")
    assert result.size == 10
    assert result.dtype == dtype


698
@pytest.mark.parametrize("init_score_type", ["array", "dataframe", "list"])
def test_init_score_for_multiclass_classification(init_score_type):
    """A 2-D ``init_score`` (10 rows x 3 columns) is returned unchanged by the Dataset."""
    init_score = [[i * 10 + j for j in range(3)] for i in range(10)]
    if init_score_type == "array":
        init_score = np.array(init_score)
    elif init_score_type == "dataframe":
        if not PANDAS_INSTALLED:
            pytest.skip("Pandas is not installed.")
        init_score = pd_DataFrame(init_score)
    data = np.random.rand(10, 2)
    ds = lgb.Dataset(data, init_score=init_score).construct()
    np.testing.assert_equal(ds.get_field("init_score"), init_score)
    np.testing.assert_equal(ds.init_score, init_score)


def test_smoke_custom_parser(tmp_path):
    """Smoke test: a parser config naming an unknown class fails on ``construct()``.

    A parser config file with ``className`` set to ``"dummy"`` is written into
    ``tmp_path`` and handed to the Dataset through ``parser_config_file``;
    constructing the Dataset must raise ``LightGBMError`` with the
    "Cannot find parser class" message.
    """
    base_dir = Path(__file__).absolute().parents[2]
    data_path = base_dir / "examples" / "binary_classification" / "binary.train"

    parser_config_file = tmp_path / "parser.ini"
    parser_config_file.write_text('{"className": "dummy", "id": "1"}')

    expected_message = "Cannot find parser class 'dummy', please register first or check config format"
    data = lgb.Dataset(data_path, params={"parser_config_file": parser_config_file})
    with pytest.raises(lgb.basic.LightGBMError, match=expected_message):
        data.construct()
724
725
726
727
728
729


def test_param_aliases():
    """Sanity-check the alias table and lookup helpers of ``_ConfigAliases``."""
    config_aliases = lgb.basic._ConfigAliases
    alias_table = config_aliases.aliases

    assert isinstance(alias_table, dict)
    assert len(alias_table) > 100

    # every entry maps a parameter name to a non-empty list that contains that name itself
    for main_name, alias_list in alias_table.items():
        assert isinstance(alias_list, list)
        assert len(alias_list) >= 1
        assert main_name in alias_list

    merged = config_aliases.get("config", "task")
    assert merged == {"config", "config_file", "task", "task_type"}

    expected_order = [
        "min_data_in_leaf",
        "min_data",
        "min_samples_leaf",
        "min_child_samples",
        "min_data_per_leaf",
    ]
    assert config_aliases.get_sorted("min_data_in_leaf") == expected_order
741
742
743
744
745
746
747


def _bad_gradients(preds, _):
    """Return random (grad, hess) arrays that are one element longer than ``preds``."""
    wrong_length = len(preds) + 1
    grad = np.random.randn(wrong_length)
    hess = np.random.rand(wrong_length)
    return grad, hess


def _good_gradients(preds, _):
    """Return random (grad, hess) arrays with exactly the shape of ``preds``."""
    dims = preds.shape
    grad = np.random.randn(*dims)
    hess = np.random.rand(*dims)
    return grad, hess
749
750
751
752
753
754
755
756
757
758
759


def test_custom_objective_safety():
    """Check that ``Booster.update`` rejects custom-objective output of the wrong length.

    For a binary and a 3-class setup, an objective returning arrays shaped
    like the predictions is accepted, while one returning arrays that are
    one element too long raises ``ValueError`` mentioning the number of
    models per iteration.
    """
    n_rows = 100
    n_classes = len([0, 1, 2])
    features = np.random.randn(n_rows, 5)
    row_index = np.arange(n_rows)
    binary_ds = lgb.Dataset(features, row_index % 2).construct()
    multiclass_ds = lgb.Dataset(features, row_index % n_classes).construct()

    def _binary_params():
        # a fresh dict per booster, as each booster gets its own params
        return {"objective": "none"}

    def _multiclass_params():
        return {"objective": "none", "num_class": n_classes}

    def _expected_error(models_per_iteration):
        return re.escape(f"number of models per one iteration ({models_per_iteration})")

    bad_bst_binary = lgb.Booster(_binary_params(), binary_ds)
    good_bst_binary = lgb.Booster(_binary_params(), binary_ds)
    bad_bst_multi = lgb.Booster(_multiclass_params(), multiclass_ds)
    good_bst_multi = lgb.Booster(_multiclass_params(), multiclass_ds)

    good_bst_binary.update(fobj=_good_gradients)
    with pytest.raises(ValueError, match=_expected_error(1)):
        bad_bst_binary.update(fobj=_bad_gradients)

    good_bst_multi.update(fobj=_good_gradients)
    with pytest.raises(ValueError, match=_expected_error(n_classes)):
        bad_bst_multi.update(fobj=_bad_gradients)
770
771


772
773
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("feature_name", [["x1", "x2"], "auto"])
def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name):
    """Check that ``_data_from_pandas`` reuses the buffer of a single-float-dtype DataFrame.

    The first element returned by ``_data_from_pandas`` must keep the input
    dtype and share memory with the NumPy array the DataFrame was built from.
    """
    pd = pytest.importorskip("pandas")
    X = np.random.rand(10, 2).astype(dtype)
    frame = pd.DataFrame(X)

    conversion = lgb.basic._data_from_pandas(
        data=frame,
        feature_name=feature_name,
        categorical_feature="auto",
        pandas_categorical=None,
    )
    built_data = conversion[0]

    assert built_data.dtype == dtype
    assert np.shares_memory(X, built_data)
783
784


785
786
@pytest.mark.parametrize("feature_name", [["x1"], [42], "auto"])
@pytest.mark.parametrize("categories", ["seen", "unseen"])
def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories):
    """Check that ``_data_from_pandas`` encodes categoricals without touching the input frame.

    A single categorical column of "a"/"b" values is converted with
    ``pandas_categorical`` listing either both values ("seen") or only "a"
    ("unseen"). The DataFrame must still equal the raw values afterwards,
    and the returned array must hold the expected codes.
    """
    pd = pytest.importorskip("pandas")
    all_seen = categories == "seen"

    X = np.random.choice(["a", "b"], 100).reshape(-1, 1)
    column_name = "a" if feature_name == "auto" else feature_name[0]
    df = pd.DataFrame(X.copy(), columns=[column_name], dtype="category")
    pandas_categorical = [["a", "b"]] if all_seen else [["a"]]

    conversion = lgb.basic._data_from_pandas(
        data=df,
        feature_name=feature_name,
        categorical_feature="auto",
        pandas_categorical=pandas_categorical,
    )
    data = conversion[0]

    # the original frame must be left exactly as it was
    np.testing.assert_equal(df[column_name], X[:, 0])

    # the built array must contain the category codes
    if all_seen:
        # every category was seen during training, so the codes are taken as they are
        codes = df[column_name].cat.codes
    else:
        # only 'a' was seen during training: its code is kept, everything else becomes nan
        a_code = df[column_name].cat.categories.get_loc("a")
        is_a = df[column_name] == "a"
        codes = np.where(is_a, a_code, np.nan)
    np.testing.assert_equal(codes, data[:, 0])
814
815


816
@pytest.mark.parametrize("min_data_in_bin", [2, 10])
def test_feature_num_bin(min_data_in_bin):
    """Check ``Dataset.feature_num_bin`` by index, by given name and by default name.

    Six columns are built (five numeric, the last one categorical) and the
    reported bin counts are compared against hand-computed expectations.
    Asking for an index one past the last feature must raise
    ``LightGBMError``.
    """
    feature_columns = [
        np.random.rand(100),
        np.array([1, 2] * 50),
        np.array([0, 1, 2] * 33 + [0]),
        np.array([1, 2] * 49 + 2 * [np.nan]),
        np.zeros(100),
        np.random.choice([0, 1], 100),
    ]
    X = np.vstack(feature_columns).T
    num_features = X.shape[1]
    n_continuous = num_features - 1

    feature_name = [f"x{i}" for i in range(n_continuous)]
    feature_name.append("cat1")
    ds_kwargs = {
        "params": {"min_data_in_bin": min_data_in_bin},
        "categorical_feature": [n_continuous],  # last feature
    }

    def _num_bins(dataset, features):
        return [dataset.feature_num_bin(feature) for feature in features]

    ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct()
    expected_num_bins = [
        100 // min_data_in_bin + 1,  # extra bin for zero
        3,  # 0, 1, 2
        3,  # 0, 1, 2
        4,  # 0, 1, 2 + nan
        0,  # unused
        3,  # 0, 1 + nan
    ]

    # lookup by feature index
    assert _num_bins(ds, range(num_features)) == expected_num_bins

    # lookup by the feature names given to the Dataset
    assert _num_bins(ds, feature_name) == expected_num_bins

    # lookup by the default feature names
    ds_no_names = lgb.Dataset(X, **ds_kwargs).construct()
    default_names = [f"Column_{i}" for i in range(num_features)]
    assert _num_bins(ds_no_names, default_names) == expected_num_bins

    # a feature index outside of the valid range must be rejected
    out_of_range_message = (
        f"Tried to retrieve number of bins for feature index {num_features}, "
        f"but the valid feature indices are \\[0, {num_features - 1}\\]."
    )
    with pytest.raises(lgb.basic.LightGBMError, match=out_of_range_message):
        ds.feature_num_bin(num_features)
863
864
865
866
867


def test_feature_num_bin_with_max_bin_by_feature():
    """Check that each feature's bin count equals its ``max_bin_by_feature`` entry."""
    X = np.random.rand(100, 3)
    num_features = X.shape[1]
    max_bin_by_feature = np.random.randint(3, 30, size=num_features)

    dataset_params = {"max_bin_by_feature": max_bin_by_feature}
    constructed = lgb.Dataset(X, params=dataset_params).construct()

    actual_num_bins = []
    for feature_idx in range(num_features):
        actual_num_bins.append(constructed.feature_num_bin(feature_idx))
    np.testing.assert_equal(actual_num_bins, max_bin_by_feature)
871
872
873
874
875


def test_set_leaf_output():
    """Check that raising every leaf output of the only tree by 1 raises predictions by 1."""
    num_leaves = 2
    X, y = load_breast_cancer(return_X_y=True)
    train_set = lgb.Dataset(X, y)
    booster = lgb.Booster({"num_leaves": num_leaves}, train_set)
    booster.update()

    baseline_pred = booster.predict(X)
    for leaf in range(num_leaves):
        shifted_output = booster.get_leaf_output(tree_id=0, leaf_id=leaf) + 1
        booster.set_leaf_output(tree_id=0, leaf_id=leaf, value=shifted_output)

    np.testing.assert_allclose(booster.predict(X), baseline_pred + 1)
883
884
885
886
887
888
889


def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Dataset():
    """Check the feature names of a Dataset created without ``feature_name``.

    A Dataset built from a 100x3 random array must, once constructed,
    report ``Column_0``, ``Column_1`` and ``Column_2`` as its feature names.
    """
    ds = lgb.Dataset(
        data=np.random.randn(100, 3),
    )
    assert ds.construct().feature_name == ["Column_0", "Column_1", "Column_2"]