# coding: utf-8
# pylint: skip-file
import os
import tempfile
import unittest

import lightgbm as lgb
import numpy as np
from sklearn.datasets import load_breast_cancer, dump_svmlight_file, load_svmlight_file
from sklearn.model_selection import train_test_split

class TestBasic(unittest.TestCase):
    """Exercises the low-level ``lgb.Dataset`` / ``lgb.Booster`` API."""

    def test(self):
        """End-to-end training smoke test.

        Trains a small binary classifier and checks that predictions agree
        across the in-memory matrix, an svmlight text file and a model file
        reloaded from disk, and that prediction-time early stopping keeps
        the predicted class stable.
        """
        # return_X_y passed by keyword: positional use is deprecated and
        # removed in newer scikit-learn releases.
        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True),
                                                            test_size=0.1, random_state=2)
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = train_data.create_valid(X_test, label=y_test)

        params = {
            "objective": "binary",
            "metric": "auc",
            "min_data": 10,
            "num_leaves": 15,
            "verbose": -1,
            "num_threads": 1,
            "max_bin": 255
        }
        bst = lgb.Booster(params, train_data)
        bst.add_valid(valid_data, "valid_1")

        for i in range(30):
            bst.update()
            if i % 10 == 0:
                print(bst.eval_train(), bst.eval_valid())

        self.assertEqual(bst.current_iteration(), 30)
        self.assertEqual(bst.num_trees(), 30)
        self.assertEqual(bst.num_model_per_iteration(), 1)

        bst.save_model("model.txt")
        pred_from_matr = bst.predict(X_test)
        # NamedTemporaryFile is used only to obtain a fresh unique path; the
        # file is re-opened afterwards so this also works on Windows.
        with tempfile.NamedTemporaryFile() as f:
            tname = f.name
        with open(tname, "w+b") as f:
            dump_svmlight_file(X_test, y_test, f)
        pred_from_file = bst.predict(tname)
        os.remove(tname)
        self.assertEqual(len(pred_from_matr), len(pred_from_file))
        for preds in zip(pred_from_matr, pred_from_file):
            self.assertAlmostEqual(*preds, places=15)

        # check saved model persistence
        bst = lgb.Booster(params, model_file="model.txt")
        pred_from_model_file = bst.predict(X_test)
        self.assertEqual(len(pred_from_matr), len(pred_from_model_file))
        for preds in zip(pred_from_matr, pred_from_model_file):
            # we need to check the consistency of model file here, so test for exact equal
            self.assertEqual(*preds)

        # check early stopping is working. Make it stop very early, so the scores should be very close to zero
        pred_parameter = {"pred_early_stop": True, "pred_early_stop_freq": 5, "pred_early_stop_margin": 1.5}
        pred_early_stopping = bst.predict(X_test, **pred_parameter)
        self.assertEqual(len(pred_from_matr), len(pred_early_stopping))
        for preds in zip(pred_early_stopping, pred_from_matr):
            # scores likely to be different, but prediction should still be the same
            self.assertEqual(preds[0] > 0, preds[1] > 0)
        # don't leave the serialized model behind in the working directory
        os.remove("model.txt")

    def test_chunked_dataset(self):
        """A Dataset fed as a list of row chunks must construct successfully."""
        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(return_X_y=True),
                                                            test_size=0.1, random_state=2)

        chunk_size = X_train.shape[0] // 10 + 1
        X_train = [X_train[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_train.shape[0] // chunk_size + 1)]
        X_test = [X_test[i * chunk_size:(i + 1) * chunk_size, :] for i in range(X_test.shape[0] // chunk_size + 1)]

        train_data = lgb.Dataset(X_train, label=y_train, params={"bin_construct_sample_cnt": 100})
        valid_data = train_data.create_valid(X_test, label=y_test, params={"bin_construct_sample_cnt": 100})

        train_data.construct()
        valid_data.construct()

    def test_subset_group(self):
        """subset() must remap query-group boundaries for ranking data."""
        X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                                           '../../examples/lambdarank/rank.train'))
        q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                          '../../examples/lambdarank/rank.train.query'))
        lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
        self.assertEqual(len(lgb_train.get_group()), 201)
        # rows 0..9 span the first group (size 1) and part of the second,
        # so the subset must report exactly two groups of sizes 1 and 9
        subset = lgb_train.subset(list(lgb.compat.range_(10))).construct()
        subset_group = subset.get_group()
        self.assertEqual(len(subset_group), 2)
        self.assertEqual(subset_group[0], 1)
        self.assertEqual(subset_group[1], 9)

    def test_add_features_throws_if_num_data_unequal(self):
        """add_features_from() must reject datasets with different row counts."""
        X1 = np.random.random((1000, 1))
        X2 = np.random.random((100, 1))
        d1 = lgb.Dataset(X1).construct()
        d2 = lgb.Dataset(X2).construct()
        with self.assertRaises(lgb.basic.LightGBMError):
            d1.add_features_from(d2)

    def test_add_features_throws_if_datasets_unconstructed(self):
        """add_features_from() must reject operands that are not constructed."""
        X1 = np.random.random((1000, 1))
        X2 = np.random.random((1000, 1))
        with self.assertRaises(ValueError):
            d1 = lgb.Dataset(X1)
            d2 = lgb.Dataset(X2)
            d1.add_features_from(d2)
        with self.assertRaises(ValueError):
            d1 = lgb.Dataset(X1).construct()
            d2 = lgb.Dataset(X2)
            d1.add_features_from(d2)
        with self.assertRaises(ValueError):
            d1 = lgb.Dataset(X1)
            d2 = lgb.Dataset(X2).construct()
            d1.add_features_from(d2)

    def test_add_features_equal_data_on_alternating_used_unused(self):
        """Merging two feature slices must dump identically to the full Dataset."""
        X = np.random.random((1000, 5))
        X[:, [1, 3]] = 0  # all-zero columns exercise the "unused feature" path
        names = ['col_%d' % i for i in range(5)]
        for j in range(1, 5):
            d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
            d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
            d1.add_features_from(d2)
            with tempfile.NamedTemporaryFile() as f:
                d1name = f.name
            d1.dump_text(d1name)
            d = lgb.Dataset(X, feature_name=names).construct()
            with tempfile.NamedTemporaryFile() as f:
                dname = f.name
            d.dump_text(dname)
            with open(d1name, 'rt') as d1f:
                d1txt = d1f.read()
            with open(dname, 'rt') as df:
                dtxt = df.read()
            os.remove(dname)
            os.remove(d1name)
            self.assertEqual(dtxt, d1txt)

    def test_add_features_same_booster_behaviour(self):
        """Training on a merged Dataset must produce the same model as the full one."""
        X = np.random.random((1000, 5))
        X[:, [1, 3]] = 0  # all-zero columns exercise the "unused feature" path
        names = ['col_%d' % i for i in range(5)]
        for j in range(1, 5):
            d1 = lgb.Dataset(X[:, :j], feature_name=names[:j]).construct()
            d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
            d1.add_features_from(d2)
            d = lgb.Dataset(X, feature_name=names).construct()
            y = np.random.random(1000)
            d1.set_label(y)
            d.set_label(y)
            b1 = lgb.Booster(train_set=d1)
            b = lgb.Booster(train_set=d)
            for k in range(10):
                b.update()
                b1.update()
            with tempfile.NamedTemporaryFile() as df:
                dname = df.name
            with tempfile.NamedTemporaryFile() as d1f:
                d1name = d1f.name
            b1.save_model(d1name)
            b.save_model(dname)
            with open(dname, 'rt') as df:
                dtxt = df.read()
            with open(d1name, 'rt') as d1f:
                d1txt = d1f.read()
            # clean up temp files (they are not removed automatically,
            # since NamedTemporaryFile was only used to get the paths)
            os.remove(dname)
            os.remove(d1name)
            self.assertEqual(dtxt, d1txt)

    def test_get_feature_penalty_and_monotone_constraints(self):
        """Getters must round-trip the params, and return None when unset."""
        X = np.random.random((1000, 1))
        d = lgb.Dataset(X, params={'feature_penalty': [0.5],
                                   'monotone_constraints': [1]}).construct()
        np.testing.assert_almost_equal(d.get_feature_penalty(), [0.5])
        np.testing.assert_array_equal(d.get_monotone_constraints(), [1])
        d = lgb.Dataset(X).construct()
        self.assertIsNone(d.get_feature_penalty())
        self.assertIsNone(d.get_monotone_constraints())

    def test_add_features_feature_penalty(self):
        """Merging datasets must combine feature penalties (default 1 when unset)."""
        X = np.random.random((1000, 2))
        # (penalty of d1, penalty of d2, expected merged penalty)
        test_cases = [
            (None, None, None),
            ([0.5], None, [0.5, 1]),
            (None, [0.5], [1, 0.5]),
            ([0.5], [0.5], [0.5, 0.5])]
        for (p1, p2, expected) in test_cases:
            params1 = {'feature_penalty': p1} if p1 is not None else {}
            d1 = lgb.Dataset(X[:, 0].reshape((-1, 1)), params=params1).construct()
            params2 = {'feature_penalty': p2} if p2 is not None else {}
            d2 = lgb.Dataset(X[:, 1].reshape((-1, 1)), params=params2).construct()
            d1.add_features_from(d2)
            actual = d1.get_feature_penalty()
            if expected is None:
                self.assertIsNone(actual)
            else:
                np.testing.assert_almost_equal(actual, expected)

    def test_add_features_monotone_types(self):
        """Merging datasets must combine monotone constraints (default 0 when unset)."""
        X = np.random.random((1000, 2))
        # (constraint of d1, constraint of d2, expected merged constraints)
        test_cases = [
            (None, None, None),
            ([1], None, [1, 0]),
            (None, [1], [0, 1]),
            ([1], [-1], [1, -1])]
        for (p1, p2, expected) in test_cases:
            params1 = {'monotone_constraints': p1} if p1 is not None else {}
            d1 = lgb.Dataset(X[:, 0].reshape((-1, 1)), params=params1).construct()
            params2 = {'monotone_constraints': p2} if p2 is not None else {}
            d2 = lgb.Dataset(X[:, 1].reshape((-1, 1)), params=params2).construct()
            d1.add_features_from(d2)
            actual = d1.get_monotone_constraints()
            # Branch on *expected*, not *actual*: the original checked
            # ``actual is None`` which would silently pass if the library
            # wrongly returned None for a case expecting constraints.
            if expected is None:
                self.assertIsNone(actual)
            else:
                np.testing.assert_array_equal(actual, expected)