# coding: utf-8
import os
import tempfile
import unittest

import lightgbm as lgb
import numpy as np

from scipy import sparse
from sklearn.datasets import load_breast_cancer, dump_svmlight_file, load_svmlight_file
from sklearn.model_selection import train_test_split


class TestBasic(unittest.TestCase):

    def test(self):
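        # end-to-end smoke test of the low-level Booster API: build Datasets, train with
        # update(), predict from a matrix and from a LibSVM file, and reload a saved model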
        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True),
                                                            test_size=0.1, random_state=2)
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = train_data.create_valid(X_test, label=y_test)

        params = {
            "objective": "binary",
            "metric": "auc",
            "min_data": 10,
            "num_leaves": 15,
            "verbose": -1,
            "num_threads": 1,
            "max_bin": 255
        }
        bst = lgb.Booster(params, train_data)
        bst.add_valid(valid_data, "valid_1")

        for i in range(30):
            bst.update()
            if i % 10 == 0:
                print(bst.eval_train(), bst.eval_valid())

        self.assertEqual(bst.current_iteration(), 30)
        self.assertEqual(bst.num_trees(), 30)
        self.assertEqual(bst.num_model_per_iteration(), 1)

        bst.save_model("model.txt")
        pred_from_matr = bst.predict(X_test)
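        # dump the test matrix to a LibSVM text file and check that predicting from the
        # file path gives the same result as predicting from the in-memory matrix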
        with tempfile.NamedTemporaryFile() as f:
            tname = f.name
        with open(tname, "w+b") as f:
            dump_svmlight_file(X_test, y_test, f)
        pred_from_file = bst.predict(tname)
        os.remove(tname)
        self.assertEqual(len(pred_from_matr), len(pred_from_file))
        for preds in zip(pred_from_matr, pred_from_file):
            self.assertAlmostEqual(*preds, places=15)

        # check saved model persistence
        bst = lgb.Booster(params, model_file="model.txt")
        os.remove("model.txt")
        pred_from_model_file = bst.predict(X_test)
        self.assertEqual(len(pred_from_matr), len(pred_from_model_file))
        for preds in zip(pred_from_matr, pred_from_model_file):
            # here we check that the reloaded model file reproduces the predictions, so test for exact equality
            self.assertEqual(*preds)

        # check that prediction early stopping works; make it stop very early, so the raw scores are truncated but the predicted class should be unchanged
        pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5}
        pred_early_stopping = bst.predict(X_test, **pred_parameter)
        self.assertEqual(len(pred_from_matr), len(pred_early_stopping))
        for preds in zip(pred_early_stopping, pred_from_matr):
            # raw scores will likely differ, but the predicted class should be the same
            self.assertEqual(preds[0] > 0, preds[1] > 0)

        # test that shape is checked during prediction
        bad_X_test = X_test[:, 1:]
        bad_shape_error_msg = "The number of features in data*"
        np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
                                       bst.predict, bad_X_test)
        np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
                                       bst.predict, sparse.csr_matrix(bad_X_test))
        np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
                                       bst.predict, sparse.csc_matrix(bad_X_test))
        with open(tname, "w+b") as f:
            dump_svmlight_file(bad_X_test, y_test, f)
        np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
                                       bst.predict, tname)
        with open(tname, "w+b") as f:
            dump_svmlight_file(X_test, y_test, f, zero_based=False)
        np.testing.assert_raises_regex(lgb.basic.LightGBMError, bad_shape_error_msg,
                                       bst.predict, tname)
        os.remove(tname)

    def test_chunked_dataset(self):
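        # the Dataset should also accept a list of row chunks instead of a single matrix
        # and still construct successfully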
        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=2)

        chunk_size = X_train.shape[0] // 10 + 1
        X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
        X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]

        train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100})
        valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100})

        train_data.construct()
        valid_data.construct()

    def test_subset_group(self):
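        # group (query) boundaries should be remapped correctly when taking a row subset
        # of a ranking Dataset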
        X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                                           '../../examples/lambdarank/rank.train'))
        q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                          '../../examples/lambdarank/rank.train.query'))
        lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
        self.assertEqual(len(lgb_train.get_group()), 201)
        subset = lgb_train.subset(list(lgb.compat.range_(10))).construct()
        subset_group = subset.get_group()
        self.assertEqual(len(subset_group), 2)
        self.assertEqual(subset_group[0], 1)
        self.assertEqual(subset_group[1], 9)

    def test_add_features_throws_if_num_data_unequal(self):
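        # add_features_from() must raise if the two Datasets have different numbers of rows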
        X1 = np.random.random((1000, 1))
        X2 = np.random.random((100, 1))
        d1 = lgb.Dataset(X1).construct()
        d2 = lgb.Dataset(X2).construct()
        with self.assertRaises(lgb.basic.LightGBMError):
            d1.add_features_from(d2)

    def test_add_features_throws_if_datasets_unconstructed(self):
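        # add_features_from() requires both Datasets to be constructed first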
        X1 = np.random.random((1000, 1))
        X2 = np.random.random((1000, 1))
        with self.assertRaises(ValueError):
            d1 = lgb.Dataset(X1)
            d2 = lgb.Dataset(X2)
            d1.add_features_from(d2)
        with self.assertRaises(ValueError):
            d1 = lgb.Dataset(X1).construct()
            d2 = lgb.Dataset(X2)
            d1.add_features_from(d2)
        with self.assertRaises(ValueError):
            d1 = lgb.Dataset(X1)
            d2 = lgb.Dataset(X2).construct()
            d1.add_features_from(d2)

    def test_add_features_equal_data_on_alternating_used_unused(self):
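        # splitting the columns of X into two Datasets and merging them back with
        # add_features_from() should give the same text dump as a Dataset built from the
        # full matrix (columns 1 and 3 are all-zero, i.e. unused features)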
        X = np.random.random((1000, 5))
        X[:, [1, 3]] = 0
        names = ['col_%d' % i for i in range(5)]
        for j in range(1, 5):
            d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
            d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
            d1.add_features_from(d2)
            with tempfile.NamedTemporaryFile() as f:
                d1name = f.name
            d1._dump_text(d1name)
            d = lgb.Dataset(X, feature_name=names).construct()
            with tempfile.NamedTemporaryFile() as f:
                dname = f.name
            d._dump_text(dname)
            with open(d1name, 'rt') as d1f:
                d1txt = d1f.read()
            with open(dname, 'rt') as df:
                dtxt = df.read()
            os.remove(dname)
            os.remove(d1name)
            self.assertEqual(dtxt, d1txt)

    def test_add_features_same_booster_behaviour(self):
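        # training on a merged Dataset should produce exactly the same model file as
        # training on the equivalent single Dataset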
        X = np.random.random((1000, 5))
        X[:, [1, 3]] = 0
        names = ['col_%d' % i for i in range(5)]
        for j in range(1, 5):
            d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
            d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
            d1.add_features_from(d2)
            d = lgb.Dataset(X, feature_name=names).construct()
            y = np.random.random(1000)
            d1.set_label(y)
            d.set_label(y)
            b1 = lgb.Booster(train_set=d1)
            b = lgb.Booster(train_set=d)
            for k in range(10):
                b.update()
                b1.update()
            with tempfile.NamedTemporaryFile() as df:
                dname = df.name
            with tempfile.NamedTemporaryFile() as d1f:
                d1name = d1f.name
            b1.save_model(d1name)
            b.save_model(dname)
            with open(dname, 'rt') as df:
                dtxt = df.read()
            with open(d1name, 'rt') as d1f:
                d1txt = d1f.read()
            self.assertEqual(dtxt, d1txt)

    def test_get_feature_penalty_and_monotone_constraints(self):
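        # feature_penalty and monotone_constraints passed as Dataset params should be
        # readable via their getters, and None when not set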
        X = np.random.random((1000, 1))
        d = lgb.Dataset(X, params={'feature_penalty': [0.5],
                                   'monotone_constraints': [1]}).construct()
        np.testing.assert_allclose(d.get_feature_penalty(), [0.5])
        np.testing.assert_array_equal(d.get_monotone_constraints(), [1])
        d = lgb.Dataset(X).construct()
        self.assertIsNone(d.get_feature_penalty())
        self.assertIsNone(d.get_monotone_constraints())

    def test_add_features_feature_penalty(self):
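        # merging Datasets should concatenate feature penalties, defaulting to 1 for a
        # Dataset that did not set any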
        X = np.random.random((1000, 2))
        test_cases = [
            (None, None, None),
            ([0.5], None, [0.5, 1]),
            (None, [0.5], [1, 0.5]),
            ([0.5], [0.5], [0.5, 0.5])]
        for (p1, p2, expected) in test_cases:
            params1 = {'feature_penalty': p1} if p1 is not None else {}
            d1 = lgb.Dataset(X[:, 0].reshape((-1, 1)), params=params1).construct()
            params2 = {'feature_penalty': p2} if p2 is not None else {}
            d2 = lgb.Dataset(X[:, 1].reshape((-1, 1)), params=params2).construct()
            d1.add_features_from(d2)
            actual = d1.get_feature_penalty()
            if expected is None:
                self.assertIsNone(actual)
            else:
                np.testing.assert_allclose(actual, expected)

    def test_add_features_monotone_types(self):
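        # merging Datasets should concatenate monotone constraints, defaulting to 0
        # (unconstrained) for a Dataset that did not set any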
        X = np.random.random((1000, 2))
        test_cases = [
            (None, None, None),
            ([1], None, [1, 0]),
            (None, [1], [0, 1]),
            ([1], [-1], [1, -1])]
        for (p1, p2, expected) in test_cases:
            params1 = {'monotone_constraints': p1} if p1 is not None else {}
            d1 = lgb.Dataset(X[:, 0].reshape((-1, 1)), params=params1).construct()
            params2 = {'monotone_constraints': p2} if p2 is not None else {}
            d2 = lgb.Dataset(X[:, 1].reshape((-1, 1)), params=params2).construct()
            d1.add_features_from(d2)
            actual = d1.get_monotone_constraints()
            if expected is None:
                self.assertIsNone(actual)
            else:
                np.testing.assert_array_equal(actual, expected)

    def test_cegb_affects_behavior(self):
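        # CEGB penalties should change which splits are chosen, so each penalized model
        # must differ from the unpenalized baseline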
        X = np.random.random((1000, 5))
        X[:, [1, 3]] = 0
        y = np.random.random(1000)
        names = ['col_%d' % i for i in range(5)]
        ds = lgb.Dataset(X, feature_name=names).construct()
        ds.set_label(y)
        base = lgb.Booster(train_set=ds)
        for k in range(10):
            base.update()
        with tempfile.NamedTemporaryFile() as f:
            basename = f.name
        base.save_model(basename)
        with open(basename, 'rt') as f:
            basetxt = f.read()
        # Set extremely harsh penalties, so CEGB will block most splits.
        cases = [{'cegb_penalty_feature_coupled': [50, 100, 10, 25, 30]},
                 {'cegb_penalty_feature_lazy': [1, 2, 3, 4, 5]},
                 {'cegb_penalty_split': 1}]
        for case in cases:
            booster = lgb.Booster(train_set=ds, params=case)
            for k in range(10):
                booster.update()
            with tempfile.NamedTemporaryFile() as f:
                casename = f.name
            booster.save_model(casename)
            with open(casename, 'rt') as f:
                casetxt = f.read()
            self.assertNotEqual(basetxt, casetxt)

    def test_cegb_scaling_equalities(self):
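        # scaling the CEGB penalties down by a factor while scaling cegb_tradeoff up by
        # the same factor should leave the trained model unchanged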
        X = np.random.random((1000, 5))
        X[:, [1, 3]] = 0
        y = np.random.random(1000)
        names = ['col_%d' % i for i in range(5)]
        ds = lgb.Dataset(X, feature_name=names).construct()
        ds.set_label(y)
        # Compare pairs of penalties, to ensure scaling works as intended
        pairs = [({'cegb_penalty_feature_coupled': [1, 2, 1, 2, 1]},
                  {'cegb_penalty_feature_coupled': [0.5, 1, 0.5, 1, 0.5], 'cegb_tradeoff': 2}),
                 ({'cegb_penalty_feature_lazy': [0.01, 0.02, 0.03, 0.04, 0.05]},
                  {'cegb_penalty_feature_lazy': [0.005, 0.01, 0.015, 0.02, 0.025], 'cegb_tradeoff': 2}),
                 ({'cegb_penalty_split': 1},
                  {'cegb_penalty_split': 2, 'cegb_tradeoff': 0.5})]
        for (p1, p2) in pairs:
            booster1 = lgb.Booster(train_set=ds, params=p1)
            booster2 = lgb.Booster(train_set=ds, params=p2)
            for k in range(10):
                booster1.update()
                booster2.update()
            with tempfile.NamedTemporaryFile() as f:
                p1name = f.name
            # Reset booster1's parameters to p2, so the parameter section of the file matches.
            booster1.reset_parameter(p2)
            booster1.save_model(p1name)
            with open(p1name, 'rt') as f:
                p1txt = f.read()
            with tempfile.NamedTemporaryFile() as f:
                p2name = f.name
            booster2.save_model(p2name)
            self.maxDiff = None
            with open(p2name, 'rt') as f:
                p2txt = f.read()
            self.assertEqual(p1txt, p2txt)

    def test_consistent_state_for_dataset_fields(self):
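        # label, weight and init_score should agree between the direct getters and
        # get_field(), whether they are passed to the constructor or set afterwards;
        # the NaN/Inf values placed in the input sequence should not survive into the stored fields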

        def check_asserts(data):
            np.testing.assert_allclose(data.label, data.get_label())
            np.testing.assert_allclose(data.label, data.get_field('label'))
            self.assertFalse(np.isnan(data.label[0]))
            self.assertFalse(np.isinf(data.label[1]))
            np.testing.assert_allclose(data.weight, data.get_weight())
            np.testing.assert_allclose(data.weight, data.get_field('weight'))
            self.assertFalse(np.isnan(data.weight[0]))
            self.assertFalse(np.isinf(data.weight[1]))
            np.testing.assert_allclose(data.init_score, data.get_init_score())
            np.testing.assert_allclose(data.init_score, data.get_field('init_score'))
            self.assertFalse(np.isnan(data.init_score[0]))
            self.assertFalse(np.isinf(data.init_score[1]))
            self.assertTrue(np.all(np.isclose([data.label[0], data.weight[0], data.init_score[0]],
                                              data.label[0])))
            self.assertAlmostEqual(data.label[1], data.weight[1])

        X, y = load_breast_cancer(True)
        sequence = np.ones(y.shape[0])
        sequence[0] = np.nan
        sequence[1] = np.inf
        lgb_data = lgb.Dataset(X, sequence, weight=sequence, init_score=sequence).construct()
        check_asserts(lgb_data)
        lgb_data = lgb.Dataset(X, y).construct()
        lgb_data.set_label(sequence)
        lgb_data.set_weight(sequence)
        lgb_data.set_init_score(sequence)
        check_asserts(lgb_data)