test_engine.py 13.1 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
# coding: utf-8
wxchan's avatar
wxchan committed
2
# pylint: skip-file
wxchan's avatar
wxchan committed
3
4
5
6
7
import copy
import math
import os
import unittest

Guolin Ke's avatar
Guolin Ke committed
8
import lightgbm as lgb
wxchan's avatar
wxchan committed
9
10
import numpy as np
from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
wxchan's avatar
wxchan committed
11
                              load_iris, load_svmlight_file)
wxchan's avatar
wxchan committed
12
from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error
wxchan's avatar
wxchan committed
13
from sklearn.model_selection import train_test_split, TimeSeriesSplit
wxchan's avatar
wxchan committed
14

wxchan's avatar
wxchan committed
15
16
17
18
19
20
try:
    import pandas as pd
    IS_PANDAS_INSTALLED = True
except ImportError:
    IS_PANDAS_INSTALLED = False

wxchan's avatar
wxchan committed
21
22
try:
    import cPickle as pickle
wxchan's avatar
wxchan committed
23
except ImportError:
wxchan's avatar
wxchan committed
24
    import pickle
wxchan's avatar
wxchan committed
25

wxchan's avatar
wxchan committed
26

wxchan's avatar
wxchan committed
27
28
29
def multi_logloss(y_true, y_pred):
    """Return the mean negative log-likelihood of the true classes.

    ``y_true`` is iterated as a sequence of class indices; ``y_pred[i][c]`` is
    read as the predicted probability of class ``c`` for sample ``i``.
    """
    losses = []
    for row_idx, label in enumerate(y_true):
        # Only the probability assigned to the true class contributes.
        losses.append(-math.log(y_pred[row_idx][label]))
    return np.mean(losses)

wxchan's avatar
wxchan committed
30

wxchan's avatar
wxchan committed
31
class TestEngine(unittest.TestCase):
wxchan's avatar
wxchan committed
32
33

    def test_binary(self):
        # Binary classification on the breast-cancer data: the held-out log
        # loss must stay below 0.15 and must match the last value recorded in
        # evals_result for the validation set.
        # return_X_y is passed by keyword because newer scikit-learn releases
        # make the loader arguments keyword-only.
        X, y = load_breast_cancer(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'verbose': -1
        }
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
        evals_result = {}
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=50,
                        valid_sets=lgb_eval,
                        verbose_eval=False,
                        evals_result=evals_result)
        ret = log_loss(y_test, gbm.predict(X_test))
        self.assertLess(ret, 0.15)
        # The metric computed here must agree with the one tracked during training.
        self.assertAlmostEqual(evals_result['valid_0']['binary_logloss'][-1], ret, places=5)
wxchan's avatar
wxchan committed
52

wxchan's avatar
wxchan committed
53
    def test_regreesion(self):
        # Regression on the Boston housing data tracked with the 'l2' metric:
        # the held-out MSE must stay below 16 and must match the last value
        # recorded in evals_result for the validation set.
        # NOTE(review): the method name is misspelled ("regreesion"); it is kept
        # unchanged so the existing test id stays stable.
        # return_X_y is passed by keyword because newer scikit-learn releases
        # make the loader arguments keyword-only.
        X, y = load_boston(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {
            'metric': 'l2',
            'verbose': -1
        }
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
        evals_result = {}
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=50,
                        valid_sets=lgb_eval,
                        verbose_eval=False,
                        evals_result=evals_result)
        ret = mean_squared_error(y_test, gbm.predict(X_test))
        self.assertLess(ret, 16)
        # The metric computed here must agree with the one tracked during training.
        self.assertAlmostEqual(evals_result['valid_0']['l2'][-1], ret, places=5)
wxchan's avatar
wxchan committed
71
72

    def test_multiclass(self):
        # 10-class classification on the digits data: the held-out multi-class
        # log loss must stay below 0.2 and must match the last value recorded
        # in evals_result for the validation set.
        # Loader arguments are passed by keyword because newer scikit-learn
        # releases make them keyword-only.
        X, y = load_digits(n_class=10, return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {
            'objective': 'multiclass',
            'metric': 'multi_logloss',
            'num_class': 10,
            'verbose': -1
        }
        lgb_train = lgb.Dataset(X_train, y_train, params=params)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
        evals_result = {}
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=50,
                        valid_sets=lgb_eval,
                        verbose_eval=False,
                        evals_result=evals_result)
        ret = multi_logloss(y_test, gbm.predict(X_test))
        self.assertLess(ret, 0.2)
        # The metric computed here must agree with the one tracked during training.
        self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5)
wxchan's avatar
wxchan committed
92

93
    def test_early_stopping(self):
        # Exercises lgb.train with early_stopping_rounds in two situations:
        # a short run that is expected to finish without stopping early, and a
        # default-length run in which early stopping is expected to occur.
        # return_X_y is passed by keyword because newer scikit-learn releases
        # make the loader arguments keyword-only.
        X, y = load_breast_cancer(return_X_y=True)
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'verbose': -1
        }
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
        valid_set_name = 'valid_set'
        # no early stopping
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=10,
                        valid_sets=lgb_eval,
                        valid_names=valid_set_name,
                        verbose_eval=False,
                        early_stopping_rounds=5)
        # best_iteration is expected to be -1 when training was not stopped early.
        self.assertEqual(gbm.best_iteration, -1)
        self.assertIn(valid_set_name, gbm.best_score)
        self.assertIn('binary_logloss', gbm.best_score[valid_set_name])
        # early stopping occurs
        gbm = lgb.train(params, lgb_train,
                        valid_sets=lgb_eval,
                        valid_names=valid_set_name,
                        verbose_eval=False,
                        early_stopping_rounds=5)
        self.assertLessEqual(gbm.best_iteration, 100)
        self.assertIn(valid_set_name, gbm.best_score)
        self.assertIn('binary_logloss', gbm.best_score[valid_set_name])
123

124
125
126
    def test_continue_train_and_dump_model(self):
        # Trains an initial model, saves it to disk, continues training from
        # the saved file with a custom eval metric, then checks the metrics,
        # dump_model() and feature_importance().
        # return_X_y is passed by keyword because newer scikit-learn releases
        # make the loader arguments keyword-only.
        X, y = load_boston(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {
            'objective': 'regression',
            'metric': 'l1',
            'verbose': -1
        }
        lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False)
        init_gbm = lgb.train(params, lgb_train, num_boost_round=20)
        model_name = 'model.txt'
        init_gbm.save_model(model_name)
        try:
            evals_result = {}
            gbm = lgb.train(params, lgb_train,
                            num_boost_round=30,
                            valid_sets=lgb_eval,
                            verbose_eval=False,
                            # test custom eval metrics
                            feval=(lambda preds, dataset: ('mae',
                                                           mean_absolute_error(dataset.get_label(), preds),
                                                           False)),
                            evals_result=evals_result,
                            init_model=model_name)
            ret = mean_absolute_error(y_test, gbm.predict(X_test))
            self.assertLess(ret, 3.5)
            self.assertAlmostEqual(evals_result['valid_0']['l1'][-1], ret, places=5)
            # The built-in 'l1' metric and the custom 'mae' metric must agree at
            # every recorded iteration.
            for l1, mae in zip(evals_result['valid_0']['l1'], evals_result['valid_0']['mae']):
                self.assertAlmostEqual(l1, mae, places=5)
            # test dump model
            self.assertIn('tree_info', gbm.dump_model())
            self.assertIsInstance(gbm.feature_importance(), np.ndarray)
        finally:
            # Remove the saved model even when an assertion above fails, so a
            # failing run does not leave the file behind.
            os.remove(model_name)

    def test_continue_train_multiclass(self):
        # Continues training a 3-class iris model from an in-memory Booster and
        # checks the held-out multi-class log loss against the value recorded
        # in evals_result for the validation set.
        # return_X_y is passed by keyword because newer scikit-learn releases
        # make the loader arguments keyword-only.
        X, y = load_iris(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {
            'objective': 'multiclass',
            'metric': 'multi_logloss',
            'num_class': 3,
            'verbose': -1
        }
        lgb_train = lgb.Dataset(X_train, y_train, params=params, free_raw_data=False)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params, free_raw_data=False)
        init_gbm = lgb.train(params, lgb_train, num_boost_round=20)
        evals_result = {}
        gbm = lgb.train(params, lgb_train,
                        num_boost_round=30,
                        valid_sets=lgb_eval,
                        verbose_eval=False,
                        evals_result=evals_result,
                        init_model=init_gbm)
        ret = multi_logloss(y_test, gbm.predict(X_test))
        self.assertLess(ret, 1.5)
        # The metric computed here must agree with the one tracked during training.
        self.assertAlmostEqual(evals_result['valid_0']['multi_logloss'][-1], ret, places=5)
wxchan's avatar
wxchan committed
178
179

    def test_cv(self):
        # Smoke test for lgb.cv: the calls below only need to complete without
        # raising; no return values are checked.
        # return_X_y is passed by keyword because newer scikit-learn releases
        # make the loader arguments keyword-only.
        X, y = load_boston(return_X_y=True)
        X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {'verbose': -1}
        lgb_train = lgb.Dataset(X_train, y_train)
        # shuffle = False, override metric in params
        params_with_metric = {'metric': 'l2', 'verbose': -1}
        lgb.cv(params_with_metric, lgb_train, num_boost_round=10, nfold=3, shuffle=False,
               metrics='l1', verbose_eval=False)
        # shuffle = True, callbacks
        lgb.cv(params, lgb_train, num_boost_round=10, nfold=3, shuffle=True,
               metrics='l1', verbose_eval=False,
               callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
        # self defined data_splitter
        tss = TimeSeriesSplit(3)
        lgb.cv(params, lgb_train, num_boost_round=10, data_splitter=tss, nfold=5,  # test if wrong nfold is ignored
               metrics='l2', verbose_eval=False)
        # lambdarank
        # Resolve the example data relative to this file rather than the current
        # working directory, so the test can be launched from any directory.
        rank_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                '..', '..', 'examples', 'lambdarank')
        X_train, y_train = load_svmlight_file(os.path.join(rank_dir, 'rank.train'))
        q_train = np.loadtxt(os.path.join(rank_dir, 'rank.train.query'))
        params_lambdarank = {'objective': 'lambdarank', 'verbose': -1}
        lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
        lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3, metrics='l2', verbose_eval=False)
wxchan's avatar
wxchan committed
202

wxchan's avatar
wxchan committed
203
    def test_feature_name(self):
        # Checks that feature names given to lgb.train are reported back by
        # Booster.feature_name().
        # return_X_y is passed by keyword because newer scikit-learn releases
        # make the loader arguments keyword-only.
        X, y = load_boston(return_X_y=True)
        X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
        params = {'verbose': -1}
        lgb_train = lgb.Dataset(X_train, y_train)
        # Derive the number of names from the data instead of hard-coding it.
        num_features = X_train.shape[1]
        feature_names = ['f_' + str(i) for i in range(num_features)]
        gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names)
        self.assertListEqual(feature_names, gbm.feature_name())
        # test feature_names with whitespaces
        feature_names_with_space = ['f ' + str(i) for i in range(num_features)]
        gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names_with_space)
        # The comparison is deliberately against the underscore variants: the
        # names reported back are expected to have spaces replaced by '_'.
        self.assertListEqual(feature_names, gbm.feature_name())

wxchan's avatar
wxchan committed
216
    def test_save_load_copy_pickle(self):
        # Continues training from the same base model obtained in several ways
        # (saved file, loaded Booster, copy, deepcopy, pickle via file, pickle
        # via bytes) and checks that every variant gives the same test MSE as
        # continuing from the original in-memory Booster.
        def test_template(init_model=None, return_model=False):
            # Train 10 rounds (optionally continuing from init_model) and return
            # either the Booster itself or its MSE on the held-out split.
            # return_X_y is passed by keyword because newer scikit-learn
            # releases make the loader arguments keyword-only.
            X, y = load_boston(return_X_y=True)
            params = {
                'objective': 'regression',
                'metric': 'l2',
                'verbose': -1
            }
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
            lgb_train = lgb.Dataset(X_train, y_train)
            gbm_template = lgb.train(params, lgb_train, num_boost_round=10, init_model=init_model)
            return gbm_template if return_model else mean_squared_error(y_test, gbm_template.predict(X_test))
        gbm = test_template(return_model=True)
        ret_origin = test_template(init_model=gbm)
        model_file = 'lgb.model'
        pickle_file = 'lgb.pkl'
        other_ret = []
        try:
            gbm.save_model(model_file)
            other_ret.append(test_template(init_model=model_file))
            gbm_load = lgb.Booster(model_file=model_file)
            other_ret.append(test_template(init_model=gbm_load))
            other_ret.append(test_template(init_model=copy.copy(gbm)))
            other_ret.append(test_template(init_model=copy.deepcopy(gbm)))
            with open(pickle_file, 'wb') as f:
                pickle.dump(gbm, f)
            # Only a file written by this test a moment ago is unpickled here.
            with open(pickle_file, 'rb') as f:
                gbm_pickle = pickle.load(f)
            other_ret.append(test_template(init_model=gbm_pickle))
            gbm_pickles = pickle.loads(pickle.dumps(gbm))
            other_ret.append(test_template(init_model=gbm_pickles))
        finally:
            # Do not leave the temporary artefacts in the working directory,
            # whether the steps above succeeded or not.
            for path in (model_file, pickle_file):
                if os.path.exists(path):
                    os.remove(path)
        for ret in other_ret:
            self.assertAlmostEqual(ret_origin, ret, places=5)
wxchan's avatar
wxchan committed
246

247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
    @unittest.skipIf(not IS_PANDAS_INSTALLED, 'pandas not installed')
    def test_pandas_categorical(self):
        # Trains on a DataFrame whose columns are all pandas 'category' dtype
        # and checks that predictions are the same whether categorical_feature
        # is left unset, given by index, given by name, or given for all
        # columns, and that they survive a save/load round trip.
        # A local seeded generator keeps the data reproducible between runs
        # without touching numpy's global random state.
        rng = np.random.RandomState(42)
        X = pd.DataFrame({"A": rng.permutation(['a', 'b', 'c', 'd'] * 75),  # str
                          "B": rng.permutation([1, 2, 3] * 100),  # int
                          "C": rng.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60),  # float
                          "D": rng.permutation([True, False] * 150)})  # bool
        y = rng.permutation([0, 1] * 150)
        X_test = pd.DataFrame({"A": rng.permutation(['a', 'b', 'e'] * 20),
                               "B": rng.permutation([1, 3] * 30),
                               "C": rng.permutation([0.1, -0.1, 0.2, 0.2] * 15),
                               "D": rng.permutation([True, False] * 30)})
        for col in ["A", "B", "C", "D"]:
            X[col] = X[col].astype('category')
            X_test[col] = X_test[col].astype('category')
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'verbose': -1
        }
        # A fresh Dataset is built before every lgb.train call, as in the
        # original test, so no run reuses a Dataset from a previous run.
        lgb_train = lgb.Dataset(X, y)
        gbm0 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False)
        pred0 = list(gbm0.predict(X_test))
        lgb_train = lgb.Dataset(X, y)
        gbm1 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
                         categorical_feature=[0])
        pred1 = list(gbm1.predict(X_test))
        lgb_train = lgb.Dataset(X, y)
        gbm2 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
                         categorical_feature=['A'])
        pred2 = list(gbm2.predict(X_test))
        lgb_train = lgb.Dataset(X, y)
        gbm3 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
                         categorical_feature=['A', 'B', 'C', 'D'])
        pred3 = list(gbm3.predict(X_test))
        model_file = 'categorical.model'
        gbm3.save_model(model_file)
        try:
            gbm4 = lgb.Booster(model_file=model_file)
            pred4 = list(gbm4.predict(X_test))
        finally:
            # Remove the saved model whether or not loading/predicting succeeded.
            os.remove(model_file)
        np.testing.assert_almost_equal(pred0, pred1)
        np.testing.assert_almost_equal(pred0, pred2)
        np.testing.assert_almost_equal(pred0, pred3)
        np.testing.assert_almost_equal(pred0, pred4)
289

wxchan's avatar
wxchan committed
290

wxchan's avatar
wxchan committed
291
292
293
# Run the suite only when this file is executed as a script; importing the
# module (for example from another test runner) must not start unittest.main(),
# which would also terminate the importing process.
if __name__ == '__main__':
    print("----------------------------------------------------------------------")
    print("running test_engine.py")
    unittest.main()