test_sklearn.py 14.5 KB
Newer Older
wxchan's avatar
wxchan committed
1
2
# coding: utf-8
# pylint: skip-file
3
import math
4
import os
wxchan's avatar
wxchan committed
5
6
import unittest

Guolin Ke's avatar
Guolin Ke committed
7
import lightgbm as lgb
wxchan's avatar
wxchan committed
8
import numpy as np
wxchan's avatar
wxchan committed
9
from sklearn.base import clone
wxchan's avatar
wxchan committed
10
from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
11
                              load_iris, load_svmlight_file)
wxchan's avatar
wxchan committed
12
from sklearn.externals import joblib
wxchan's avatar
wxchan committed
13
14
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
15
from sklearn.utils.estimator_checks import (_yield_all_checks, SkipTest,
16
17
18
19
20
21
                                            check_parameters_default_constructible)
try:
    from sklearn.utils.estimator_checks import check_no_fit_attributes_set_in_init
    sklearn_at_least_019 = True
except ImportError:
    sklearn_at_least_019 = False
wxchan's avatar
wxchan committed
22

wxchan's avatar
wxchan committed
23

24
25
26
27
28
29
def multi_error(y_true, y_pred):
    return np.mean(y_true != y_pred)


def multi_logloss(y_true, y_pred):
    return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)])
wxchan's avatar
wxchan committed
30

wxchan's avatar
wxchan committed
31
32
33
34

class TestSklearn(unittest.TestCase):

    def test_binary(self):
35
36
37
        X, y = load_breast_cancer(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        gbm = lgb.LGBMClassifier(n_estimators=50, silent=True)
38
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
39
        ret = log_loss(y_test, gbm.predict_proba(X_test))
wxchan's avatar
wxchan committed
40
        self.assertLess(ret, 0.15)
41
        self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['binary_logloss'][gbm.best_iteration_ - 1], places=5)
wxchan's avatar
wxchan committed
42

43
    def test_regression(self):
44
45
46
        X, y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        gbm = lgb.LGBMRegressor(n_estimators=50, silent=True)
47
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
48
49
        ret = mean_squared_error(y_test, gbm.predict(X_test))
        self.assertLess(ret, 16)
50
        self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1], places=5)
wxchan's avatar
wxchan committed
51

wxchan's avatar
wxchan committed
52
    def test_multiclass(self):
53
54
55
        X, y = load_digits(10, True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        gbm = lgb.LGBMClassifier(n_estimators=50, silent=True)
56
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
57
        ret = multi_error(y_test, gbm.predict(X_test))
wxchan's avatar
wxchan committed
58
        self.assertLess(ret, 0.2)
59
        ret = multi_logloss(y_test, gbm.predict_proba(X_test))
60
        self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['multi_logloss'][gbm.best_iteration_ - 1], places=5)
wxchan's avatar
wxchan committed
61

wxchan's avatar
wxchan committed
62
    def test_lambdarank(self):
63
64
65
66
        X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train'))
        X_test, y_test = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.test'))
        q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train.query'))
        q_test = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.test.query'))
67
68
        gbm = lgb.LGBMRanker()
        gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)],
69
                eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=5, verbose=False,
70
                callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)])
wxchan's avatar
wxchan committed
71
72
73
74
75
76

    def test_regression_with_custom_objective(self):
        def objective_ls(y_true, y_pred):
            grad = (y_pred - y_true)
            hess = np.ones(len(y_true))
            return grad, hess
77
78
79
        X, y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        gbm = lgb.LGBMRegressor(n_estimators=50, silent=True, objective=objective_ls)
80
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
81
        ret = mean_squared_error(y_test, gbm.predict(X_test))
wxchan's avatar
wxchan committed
82
        self.assertLess(ret, 100)
83
        self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1], places=5)
wxchan's avatar
wxchan committed
84
85
86
87
88
89
90

    def test_binary_classification_with_custom_objective(self):
        def logregobj(y_true, y_pred):
            y_pred = 1.0 / (1.0 + np.exp(-y_pred))
            grad = y_pred - y_true
            hess = y_pred * (1.0 - y_pred)
            return grad, hess
91
        X, y = load_digits(2, True)
wxchan's avatar
wxchan committed
92

wxchan's avatar
wxchan committed
93
94
        def binary_error(y_test, y_pred):
            return np.mean([int(p > 0.5) != y for y, p in zip(y_test, y_pred)])
95
96
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        gbm = lgb.LGBMClassifier(n_estimators=50, silent=True, objective=logregobj)
97
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
98
        ret = binary_error(y_test, gbm.predict(X_test))
wxchan's avatar
wxchan committed
99
100
        self.assertLess(ret, 0.1)

101
    def test_dart(self):
102
103
        X, y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
104
105
106
107
        gbm = lgb.LGBMRegressor(boosting_type='dart')
        gbm.fit(X_train, y_train)
        self.assertLessEqual(gbm.score(X_train, y_train), 1.)

wxchan's avatar
wxchan committed
108
    def test_grid_search(self):
109
110
        X, y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
111
        params = {'boosting_type': ['dart', 'gbdt'],
wxchan's avatar
wxchan committed
112
113
                  'n_estimators': [5, 8],
                  'drop_rate': [0.05, 0.1]}
114
        gbm = GridSearchCV(lgb.LGBMRegressor(), params, cv=3)
wxchan's avatar
wxchan committed
115
        gbm.fit(X_train, y_train)
wxchan's avatar
wxchan committed
116
        self.assertIn(gbm.best_params_['n_estimators'], [5, 8])
wxchan's avatar
wxchan committed
117

118
    def test_clone_and_property(self):
119
120
121
122
123
        X, y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        gbm = lgb.LGBMRegressor(n_estimators=100, silent=True)
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)

wxchan's avatar
wxchan committed
124
        gbm_clone = clone(gbm)
125
        self.assertIsInstance(gbm.booster_, lgb.Booster)
126
        self.assertIsInstance(gbm.feature_importances_, np.ndarray)
127
128
129
130
131

        X, y = load_digits(2, True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        clf = lgb.LGBMClassifier()
        clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
132
133
134
        self.assertListEqual(sorted(clf.classes_), [0, 1])
        self.assertEqual(clf.n_classes_, 2)
        self.assertIsInstance(clf.booster_, lgb.Booster)
135
        self.assertIsInstance(clf.feature_importances_, np.ndarray)
wxchan's avatar
wxchan committed
136

wxchan's avatar
wxchan committed
137
    def test_joblib(self):
138
139
140
141
142
        X, y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        gbm = lgb.LGBMRegressor(n_estimators=100, silent=True)
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)

wxchan's avatar
wxchan committed
143
144
        joblib.dump(gbm, 'lgb.pkl')
        gbm_pickle = joblib.load('lgb.pkl')
145
        self.assertIsInstance(gbm_pickle.booster_, lgb.Booster)
wxchan's avatar
wxchan committed
146
        self.assertDictEqual(gbm.get_params(), gbm_pickle.get_params())
147
        self.assertListEqual(list(gbm.feature_importances_), list(gbm_pickle.feature_importances_))
148
149
150

        X, y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
wxchan's avatar
wxchan committed
151
152
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
        gbm_pickle.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
wxchan's avatar
wxchan committed
153
154
155
        for key in gbm.evals_result_:
            for evals in zip(gbm.evals_result_[key], gbm_pickle.evals_result_[key]):
                self.assertAlmostEqual(*evals, places=5)
wxchan's avatar
wxchan committed
156
157
158
159
160
        pred_origin = gbm.predict(X_test)
        pred_pickle = gbm_pickle.predict(X_test)
        self.assertEqual(len(pred_origin), len(pred_pickle))
        for preds in zip(pred_origin, pred_pickle):
            self.assertAlmostEqual(*preds, places=5)
161
162
163
164
165
166
167

    def test_feature_importances_single_leaf(self):
        clf = lgb.LGBMClassifier(n_estimators=100)
        data = load_iris()
        clf.fit(data.data, data.target)
        importances = clf.feature_importances_
        self.assertEqual(len(importances), 4)
168

169
170
171
172
173
174
175
176
177
178
179
180
181
    def test_feature_importances_type(self):
        clf = lgb.LGBMClassifier(n_estimators=100)
        data = load_iris()
        clf.fit(data.data, data.target)
        clf.set_params(importance_type='split')
        importances_split = clf.feature_importances_
        clf.set_params(importance_type='gain')
        importances_gain = clf.feature_importances_
        # Test that the largest element is NOT the same, the smallest can be the same, i.e. zero
        importance_split_top1 = sorted(importances_split, reverse=True)[0]
        importance_gain_top1 = sorted(importances_gain, reverse=True)[0]
        self.assertNotEqual(importance_split_top1, importance_gain_top1)

182
183
184
185
186
187
188
189
190
191
192
    def test_sklearn_backward_compatibility(self):
        iris = load_iris()
        X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

        # Tests that `seed` is the same as `random_state`
        clf_1 = lgb.sklearn.LGBMClassifier(seed=42, subsample=0.6, colsample_bytree=0.8)
        clf_2 = lgb.sklearn.LGBMClassifier(random_state=42, subsample=0.6, colsample_bytree=0.8)
        y_pred_1 = clf_1.fit(X_train, y_train).predict_proba(X_test)
        y_pred_2 = clf_2.fit(X_train, y_train).predict_proba(X_test)
        np.testing.assert_allclose(y_pred_1, y_pred_2)

193
194
    # sklearn <0.19 cannot accept instance, but many tests could be passed only with min_data=1 and min_data_in_bin=1
    @unittest.skipIf(not sklearn_at_least_019, 'scikit-learn version is less than 0.19')
195
    def test_sklearn_integration(self):
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
        # we cannot use `check_estimator` directly since there is no skip test mechanism
        for name, estimator in ((lgb.sklearn.LGBMClassifier.__name__, lgb.sklearn.LGBMClassifier),
                                (lgb.sklearn.LGBMRegressor.__name__, lgb.sklearn.LGBMRegressor)):
            check_parameters_default_constructible(name, estimator)
            check_no_fit_attributes_set_in_init(name, estimator)
            # we cannot leave default params (see https://github.com/Microsoft/LightGBM/issues/833)
            estimator = estimator(min_child_samples=1, min_data_in_bin=1)
            for check in _yield_all_checks(name, estimator):
                if check.__name__ == 'check_estimators_nan_inf':
                    continue  # skip test because LightGBM deals with nan
                try:
                    check(name, estimator)
                except SkipTest as message:
                    warnings.warn(message, SkipTestWarning)

    @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
212
    def test_pandas_categorical(self):
213
        import pandas as pd
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
        X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75),  # str
                          "B": np.random.permutation([1, 2, 3] * 100),  # int
                          "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60),  # float
                          "D": np.random.permutation([True, False] * 150)})  # bool
        y = np.random.permutation([0, 1] * 150)
        X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),
                               "B": np.random.permutation([1, 3] * 30),
                               "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
                               "D": np.random.permutation([True, False] * 30)})
        for col in ["A", "B", "C", "D"]:
            X[col] = X[col].astype('category')
            X_test[col] = X_test[col].astype('category')
        gbm0 = lgb.sklearn.LGBMClassifier().fit(X, y)
        pred0 = list(gbm0.predict(X_test))
        gbm1 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=[0])
        pred1 = list(gbm1.predict(X_test))
        gbm2 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A'])
        pred2 = list(gbm2.predict(X_test))
        gbm3 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A', 'B', 'C', 'D'])
        pred3 = list(gbm3.predict(X_test))
        gbm3.booster_.save_model('categorical.model')
        gbm4 = lgb.Booster(model_file='categorical.model')
        pred4 = list(gbm4.predict(X_test))
        pred_prob = list(gbm0.predict_proba(X_test)[:, 1])
        np.testing.assert_almost_equal(pred0, pred1)
        np.testing.assert_almost_equal(pred0, pred2)
        np.testing.assert_almost_equal(pred0, pred3)
        np.testing.assert_almost_equal(pred_prob, pred4)
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286

    def test_predict(self):
        iris = load_iris()
        X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                            test_size=0.2, random_state=42)

        gbm = lgb.train({'objective': 'multiclass',
                         'num_class': 3,
                         'verbose': -1},
                        lgb.Dataset(X_train, y_train))
        clf = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train)

        # Tests same probabilities
        res_engine = gbm.predict(X_test)
        res_sklearn = clf.predict_proba(X_test)
        np.testing.assert_allclose(res_engine, res_sklearn)

        # Tests same predictions
        res_engine = np.argmax(gbm.predict(X_test), axis=1)
        res_sklearn = clf.predict(X_test)
        np.testing.assert_equal(res_engine, res_sklearn)

        # Tests same raw scores
        res_engine = gbm.predict(X_test, raw_score=True)
        res_sklearn = clf.predict(X_test, raw_score=True)
        np.testing.assert_allclose(res_engine, res_sklearn)

        # Tests same leaf indices
        res_engine = gbm.predict(X_test, pred_leaf=True)
        res_sklearn = clf.predict(X_test, pred_leaf=True)
        np.testing.assert_equal(res_engine, res_sklearn)

        # Tests same feature contributions
        res_engine = gbm.predict(X_test, pred_contrib=True)
        res_sklearn = clf.predict(X_test, pred_contrib=True)
        np.testing.assert_allclose(res_engine, res_sklearn)

        # Tests other parameters for the prediction works
        res_engine = gbm.predict(X_test)
        res_sklearn_params = clf.predict_proba(X_test,
                                               pred_early_stop=True,
                                               pred_early_stop_margin=1.0)
        self.assertRaises(AssertionError,
                          np.testing.assert_allclose,
                          res_engine, res_sklearn_params)