test_sklearn.py 14 KB
Newer Older
wxchan's avatar
wxchan committed
1
2
# coding: utf-8
# pylint: skip-file
3
import math
4
import os
wxchan's avatar
wxchan committed
5
6
import unittest

Guolin Ke's avatar
Guolin Ke committed
7
import lightgbm as lgb
wxchan's avatar
wxchan committed
8
import numpy as np
wxchan's avatar
wxchan committed
9
from sklearn.base import clone
wxchan's avatar
wxchan committed
10
from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
11
                              load_iris, load_svmlight_file)
wxchan's avatar
wxchan committed
12
from sklearn.externals import joblib
wxchan's avatar
wxchan committed
13
14
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
15
from sklearn.utils.estimator_checks import (_yield_all_checks, SkipTest,
16
17
18
19
20
21
                                            check_parameters_default_constructible)
try:
    from sklearn.utils.estimator_checks import check_no_fit_attributes_set_in_init
    sklearn_at_least_019 = True
except ImportError:
    sklearn_at_least_019 = False
22
23
24
25
26
try:
    import pandas as pd
    IS_PANDAS_INSTALLED = True
except ImportError:
    IS_PANDAS_INSTALLED = False
wxchan's avatar
wxchan committed
27

wxchan's avatar
wxchan committed
28

29
30
31
32
33
34
def multi_error(y_true, y_pred):
    """Return the fraction of samples whose predicted label differs from the true one.

    Parameters
    ----------
    y_true : array-like
        True labels, one per sample.
    y_pred : array-like
        Predicted labels, one per sample.

    Returns
    -------
    float
        Mean of the element-wise ``y_true != y_pred`` mask.
    """
    # Convert first: comparing two plain lists with ``!=`` gives a single bool
    # instead of a per-sample mask, which would make the result 0.0 or 1.0.
    return np.mean(np.asarray(y_true) != np.asarray(y_pred))


def multi_logloss(y_true, y_pred):
    """Return the mean negative log of the probability assigned to each true class.

    ``y_pred[i][y]`` is read for every position ``i`` and label ``y`` of
    ``y_true``, so each label is used directly as an index into its row.
    """
    sample_losses = []
    for row_index, label in enumerate(y_true):
        true_class_probability = y_pred[row_index][label]
        sample_losses.append(-math.log(true_class_probability))
    return np.mean(sample_losses)
wxchan's avatar
wxchan committed
35

wxchan's avatar
wxchan committed
36
37
38
39

class TestSklearn(unittest.TestCase):

    def test_binary(self):
        """Fit an LGBMClassifier on the breast cancer data and check its hold-out log loss.

        The loss must be below 0.15 and must match, to 5 decimal places, the
        'binary_logloss' value recorded for the best iteration.
        """
        features, labels = load_breast_cancer(True)
        train_x, valid_x, train_y, valid_y = train_test_split(
            features, labels, test_size=0.1, random_state=42)

        model = lgb.LGBMClassifier(n_estimators=50, silent=True)
        model.fit(train_x, train_y,
                  eval_set=[(valid_x, valid_y)],
                  early_stopping_rounds=5,
                  verbose=False)

        loss = log_loss(valid_y, model.predict_proba(valid_x))
        self.assertLess(loss, 0.15)

        # best_iteration_ is 1-based while the recorded history is a 0-based list.
        recorded_losses = model.evals_result_['valid_0']['binary_logloss']
        self.assertAlmostEqual(loss, recorded_losses[model.best_iteration_ - 1], places=5)
wxchan's avatar
wxchan committed
47

48
    def test_regression(self):
        """Fit an LGBMRegressor on the Boston data and check its hold-out squared error.

        The error must be below 16 and must match, to 5 decimal places, the
        'l2' value recorded for the best iteration.
        """
        features, targets = load_boston(True)
        train_x, valid_x, train_y, valid_y = train_test_split(
            features, targets, test_size=0.1, random_state=42)

        model = lgb.LGBMRegressor(n_estimators=50, silent=True)
        model.fit(train_x, train_y,
                  eval_set=[(valid_x, valid_y)],
                  early_stopping_rounds=5,
                  verbose=False)

        mse = mean_squared_error(valid_y, model.predict(valid_x))
        self.assertLess(mse, 16)

        # best_iteration_ is 1-based while the recorded history is a 0-based list.
        recorded_l2 = model.evals_result_['valid_0']['l2']
        self.assertAlmostEqual(mse, recorded_l2[model.best_iteration_ - 1], places=5)
wxchan's avatar
wxchan committed
56

wxchan's avatar
wxchan committed
57
    def test_multiclass(self):
        """Fit an LGBMClassifier on the 10-class digits data and check error and log loss.

        The hold-out error rate must be below 0.2, and the hold-out log loss
        must match, to 5 decimal places, the 'multi_logloss' value recorded for
        the best iteration.
        """
        features, labels = load_digits(10, True)
        train_x, valid_x, train_y, valid_y = train_test_split(
            features, labels, test_size=0.1, random_state=42)

        model = lgb.LGBMClassifier(n_estimators=50, silent=True)
        model.fit(train_x, train_y,
                  eval_set=[(valid_x, valid_y)],
                  early_stopping_rounds=5,
                  verbose=False)

        error_rate = multi_error(valid_y, model.predict(valid_x))
        self.assertLess(error_rate, 0.2)

        loss = multi_logloss(valid_y, model.predict_proba(valid_x))
        # best_iteration_ is 1-based while the recorded history is a 0-based list.
        recorded_losses = model.evals_result_['valid_0']['multi_logloss']
        self.assertAlmostEqual(loss, recorded_losses[model.best_iteration_ - 1], places=5)
wxchan's avatar
wxchan committed
66

wxchan's avatar
wxchan committed
67
    def test_lambdarank(self):
        """Fit an LGBMRanker on the bundled lambdarank example data.

        This test makes no assertions; it passes when loading the example
        files and fitting (with query groups, an evaluation set, early stopping
        and a learning-rate-decay callback) complete without raising.
        """
        test_dir = os.path.dirname(os.path.realpath(__file__))

        def example_path(relative_path):
            # Resolve a path given relative to the directory holding this test file.
            return os.path.join(test_dir, relative_path)

        X_train, y_train = load_svmlight_file(example_path('../../examples/lambdarank/rank.train'))
        X_test, y_test = load_svmlight_file(example_path('../../examples/lambdarank/rank.test'))
        q_train = np.loadtxt(example_path('../../examples/lambdarank/rank.train.query'))
        q_test = np.loadtxt(example_path('../../examples/lambdarank/rank.test.query'))

        # Learning rate starts at 0.1 and is multiplied by 0.95 each iteration.
        decay_learning_rate = lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)

        ranker = lgb.LGBMRanker()
        ranker.fit(X_train, y_train,
                   group=q_train,
                   eval_set=[(X_test, y_test)],
                   eval_group=[q_test],
                   eval_at=[1, 3],
                   early_stopping_rounds=5,
                   verbose=False,
                   callbacks=[decay_learning_rate])
wxchan's avatar
wxchan committed
76
77
78
79
80
81

    def test_regression_with_custom_objective(self):
        """Fit an LGBMRegressor with a callable least-squares objective on the Boston data.

        The hold-out squared error must be below 100 and must match, to 5
        decimal places, the 'l2' value recorded for the best iteration.
        """
        def objective_ls(y_true, y_pred):
            # Gradient and hessian of 0.5 * (y_pred - y_true) ** 2 w.r.t. y_pred.
            residual = (y_pred - y_true)
            unit_hessian = np.ones(len(y_true))
            return residual, unit_hessian

        features, targets = load_boston(True)
        train_x, valid_x, train_y, valid_y = train_test_split(
            features, targets, test_size=0.1, random_state=42)

        model = lgb.LGBMRegressor(n_estimators=50, silent=True, objective=objective_ls)
        model.fit(train_x, train_y,
                  eval_set=[(valid_x, valid_y)],
                  early_stopping_rounds=5,
                  verbose=False)

        mse = mean_squared_error(valid_y, model.predict(valid_x))
        self.assertLess(mse, 100)

        # best_iteration_ is 1-based while the recorded history is a 0-based list.
        recorded_l2 = model.evals_result_['valid_0']['l2']
        self.assertAlmostEqual(mse, recorded_l2[model.best_iteration_ - 1], places=5)
wxchan's avatar
wxchan committed
89
90
91
92
93
94
95

    def test_binary_classification_with_custom_objective(self):
        """Fit an LGBMClassifier with a callable logistic objective on two-class digits.

        Predictions are thresholded at 0.5 and the resulting hold-out error
        rate must be below 0.1.
        """
        def logregobj(y_true, y_pred):
            # Squash the raw score through a sigmoid, then return the gradient
            # and hessian of the logistic loss with respect to the raw score.
            prob = 1.0 / (1.0 + np.exp(-y_pred))
            gradient = prob - y_true
            hessian = prob * (1.0 - prob)
            return gradient, hessian

        def binary_error(y_test, y_pred):
            # Share of samples where the 0.5-thresholded prediction is wrong.
            mismatches = [int(score > 0.5) != label
                          for label, score in zip(y_test, y_pred)]
            return np.mean(mismatches)

        features, labels = load_digits(2, True)
        train_x, valid_x, train_y, valid_y = train_test_split(
            features, labels, test_size=0.1, random_state=42)

        model = lgb.LGBMClassifier(n_estimators=50, silent=True, objective=logregobj)
        model.fit(train_x, train_y,
                  eval_set=[(valid_x, valid_y)],
                  early_stopping_rounds=5,
                  verbose=False)

        error_rate = binary_error(valid_y, model.predict(valid_x))
        self.assertLess(error_rate, 0.1)

106
    def test_dart(self):
        """Fit an LGBMRegressor with ``boosting_type='dart'`` and check its training score.

        The only assertion is that ``score`` on the training data is at most 1.
        """
        features, targets = load_boston(True)
        # Only the training part of the split is used here.
        train_x, _, train_y, _ = train_test_split(
            features, targets, test_size=0.1, random_state=42)

        model = lgb.LGBMRegressor(boosting_type='dart')
        model.fit(train_x, train_y)

        training_score = model.score(train_x, train_y)
        self.assertLessEqual(training_score, 1.)

wxchan's avatar
wxchan committed
113
    def test_grid_search(self):
        """Run a 3-fold GridSearchCV over LGBMRegressor parameters on the Boston data.

        Checks that the selected ``n_estimators`` is one of the candidate values.
        """
        features, targets = load_boston(True)
        # Only the training part of the split is used here.
        train_x, _, train_y, _ = train_test_split(
            features, targets, test_size=0.1, random_state=42)

        candidate_estimators = [5, 8]
        param_grid = {'boosting_type': ['dart', 'gbdt'],
                      'n_estimators': candidate_estimators,
                      'drop_rate': [0.05, 0.1]}

        search = GridSearchCV(lgb.LGBMRegressor(), param_grid, cv=3)
        search.fit(train_x, train_y)

        self.assertIn(search.best_params_['n_estimators'], [5, 8])
wxchan's avatar
wxchan committed
122

123
    def test_clone_and_property(self):
        """Check that fitted models can be cloned and expose their fitted properties.

        A fitted regressor is passed through ``sklearn.base.clone`` and must
        expose ``booster_`` and ``feature_importances_``. A classifier fitted
        on two-class digits must also expose ``classes_`` and ``n_classes_``.
        """
        # --- regressor ---
        features, targets = load_boston(True)
        train_x, valid_x, train_y, valid_y = train_test_split(
            features, targets, test_size=0.1, random_state=42)
        regressor = lgb.LGBMRegressor(n_estimators=100, silent=True)
        regressor.fit(train_x, train_y,
                      eval_set=[(valid_x, valid_y)],
                      early_stopping_rounds=10,
                      verbose=False)

        # The clone itself is not inspected; the call only has to succeed.
        clone(regressor)
        self.assertIsInstance(regressor.booster_, lgb.Booster)
        self.assertIsInstance(regressor.feature_importances_, np.ndarray)

        # --- classifier ---
        features, labels = load_digits(2, True)
        train_x, valid_x, train_y, valid_y = train_test_split(
            features, labels, test_size=0.1, random_state=42)
        classifier = lgb.LGBMClassifier()
        classifier.fit(train_x, train_y,
                       eval_set=[(valid_x, valid_y)],
                       early_stopping_rounds=10,
                       verbose=False)

        self.assertListEqual(sorted(classifier.classes_), [0, 1])
        self.assertEqual(classifier.n_classes_, 2)
        self.assertIsInstance(classifier.booster_, lgb.Booster)
        self.assertIsInstance(classifier.feature_importances_, np.ndarray)
wxchan's avatar
wxchan committed
141

wxchan's avatar
wxchan committed
142
    def test_joblib(self):
        """Check that an LGBMRegressor survives a joblib dump/load round trip.

        The reloaded model must expose a Booster, the same parameters and the
        same feature importances as the original. After refitting both models
        on the same data, their recorded evaluation histories and their
        predictions must agree to 5 decimal places.
        """
        # Local imports: only this test needs them.
        import shutil
        import tempfile

        X, y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        gbm = lgb.LGBMRegressor(n_estimators=100, silent=True)
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)

        # Dump into a throwaway directory so no pickle artefacts are left in
        # the working directory, even when the round trip fails.
        dump_dir = tempfile.mkdtemp()
        try:
            pickle_path = os.path.join(dump_dir, 'lgb.pkl')
            joblib.dump(gbm, pickle_path)
            gbm_pickle = joblib.load(pickle_path)
        finally:
            shutil.rmtree(dump_dir, ignore_errors=True)

        self.assertIsInstance(gbm_pickle.booster_, lgb.Booster)
        self.assertDictEqual(gbm.get_params(), gbm_pickle.get_params())
        self.assertListEqual(list(gbm.feature_importances_), list(gbm_pickle.feature_importances_))

        # The same split is reused for refitting; reloading the dataset with the
        # same random_state would produce identical arrays.
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
        gbm_pickle.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

        # evals_result_ maps eval-set name -> metric name -> list of values.
        # Walk down to the value lists: zipping the inner dicts directly would
        # only pair up the metric names and never compare a single number.
        self.assertEqual(sorted(gbm.evals_result_), sorted(gbm_pickle.evals_result_))
        for eval_name, metrics in gbm.evals_result_.items():
            metrics_pickle = gbm_pickle.evals_result_[eval_name]
            self.assertEqual(sorted(metrics), sorted(metrics_pickle))
            for metric_name, values in metrics.items():
                values_pickle = metrics_pickle[metric_name]
                self.assertEqual(len(values), len(values_pickle))
                for value, value_pickle in zip(values, values_pickle):
                    self.assertAlmostEqual(value, value_pickle, places=5)

        pred_origin = gbm.predict(X_test)
        pred_pickle = gbm_pickle.predict(X_test)
        self.assertEqual(len(pred_origin), len(pred_pickle))
        for preds in zip(pred_origin, pred_pickle):
            self.assertAlmostEqual(*preds, places=5)
166
167
168
169
170
171
172

    def test_feature_importances_single_leaf(self):
        """Fit a 100-estimator LGBMClassifier on iris and check it reports 4 feature importances."""
        iris = load_iris()
        classifier = lgb.LGBMClassifier(n_estimators=100)
        classifier.fit(iris.data, iris.target)
        self.assertEqual(len(classifier.feature_importances_), 4)
173
174
175
176
177
178
179
180
181
182
183
184

    def test_sklearn_backward_compatibility(self):
        """Check that the ``seed`` keyword gives the same model as ``random_state``."""
        iris = load_iris()
        train_x, valid_x, train_y, _ = train_test_split(
            iris.data, iris.target, test_size=0.2, random_state=42)

        # Tests that `seed` is the same as `random_state`
        seeded = lgb.sklearn.LGBMClassifier(seed=42, subsample=0.6, colsample_bytree=0.8)
        random_stated = lgb.sklearn.LGBMClassifier(random_state=42, subsample=0.6, colsample_bytree=0.8)

        seeded.fit(train_x, train_y)
        proba_seeded = seeded.predict_proba(valid_x)
        random_stated.fit(train_x, train_y)
        proba_random_stated = random_stated.predict_proba(valid_x)

        np.testing.assert_allclose(proba_seeded, proba_random_stated)

185
    def test_sklearn_integration(self):
        """Run scikit-learn's estimator checks against LGBMClassifier and LGBMRegressor.

        Does nothing when ``sklearn_at_least_019`` is False. Otherwise each
        estimator class is checked for default constructibility and for not
        setting fit attributes in ``__init__``, then an instance is run through
        every check yielded by ``_yield_all_checks``. A check that raises
        ``SkipTest`` is reported as a ``SkipTestWarning`` instead of failing.
        """
        # sklearn <0.19 cannot accept instance, but many tests could be passed only with min_data=1 and min_data_in_bin=1
        if not sklearn_at_least_019:
            return

        # Imported locally: the module's import block provides neither name, so
        # the SkipTest handler below would otherwise fail with a NameError.
        import warnings
        from sklearn.exceptions import SkipTestWarning

        # we cannot use `check_estimator` directly since there is no skip test mechanism
        for estimator_class in (lgb.sklearn.LGBMClassifier, lgb.sklearn.LGBMRegressor):
            name = estimator_class.__name__
            check_parameters_default_constructible(name, estimator_class)
            check_no_fit_attributes_set_in_init(name, estimator_class)
            # we cannot leave default params (see https://github.com/Microsoft/LightGBM/issues/833)
            estimator = estimator_class(min_child_samples=1, min_data_in_bin=1)
            for check in _yield_all_checks(name, estimator):
                if check.__name__ == 'check_estimators_nan_inf':
                    continue  # skip test because LightGBM deals with nan
                try:
                    check(name, estimator)
                except SkipTest as message:
                    # Pass the text, not the exception object, as the warning message.
                    warnings.warn(str(message), SkipTestWarning)
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232

    @unittest.skipIf(not IS_PANDAS_INSTALLED, 'pandas not installed')
    def test_pandas_categorical(self):
        """Check that pandas 'category' columns give the same predictions however they are declared.

        Four classifiers are fitted on the same frame: with no
        ``categorical_feature``, with ``[0]``, with ``['A']`` and with all four
        column names. Their predictions on a test frame (whose column "A"
        contains a value 'e' absent from training) must agree. The last model
        is also saved, reloaded as a ``lgb.Booster``, and its output compared
        with the first model's class-1 probabilities.
        """
        # Local imports: only this test needs them.
        import shutil
        import tempfile

        X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75),  # str
                          "B": np.random.permutation([1, 2, 3] * 100),  # int
                          "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60),  # float
                          "D": np.random.permutation([True, False] * 150)})  # bool
        y = np.random.permutation([0, 1] * 150)
        X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),
                               "B": np.random.permutation([1, 3] * 30),
                               "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
                               "D": np.random.permutation([True, False] * 30)})
        for col in ["A", "B", "C", "D"]:
            X[col] = X[col].astype('category')
            X_test[col] = X_test[col].astype('category')
        gbm0 = lgb.sklearn.LGBMClassifier().fit(X, y)
        pred0 = list(gbm0.predict(X_test))
        gbm1 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=[0])
        pred1 = list(gbm1.predict(X_test))
        gbm2 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A'])
        pred2 = list(gbm2.predict(X_test))
        gbm3 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A', 'B', 'C', 'D'])
        pred3 = list(gbm3.predict(X_test))

        # Save the model into a throwaway directory so no 'categorical.model'
        # file is left in the working directory, even when the test fails.
        model_dir = tempfile.mkdtemp()
        try:
            model_path = os.path.join(model_dir, 'categorical.model')
            gbm3.booster_.save_model(model_path)
            gbm4 = lgb.Booster(model_file=model_path)
            # Predict while the file still exists, in case the Booster reads it again.
            pred4 = list(gbm4.predict(X_test))
        finally:
            shutil.rmtree(model_dir, ignore_errors=True)

        pred_prob = list(gbm0.predict_proba(X_test)[:, 1])
        np.testing.assert_almost_equal(pred0, pred1)
        np.testing.assert_almost_equal(pred0, pred2)
        np.testing.assert_almost_equal(pred0, pred3)
        np.testing.assert_almost_equal(pred_prob, pred4)
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277

    def test_predict(self):
        """Check that the sklearn wrapper predicts the same as a Booster from ``lgb.train``.

        Both models are fitted on the same iris split. Probabilities, class
        predictions, raw scores, leaf indices and feature contributions are
        compared in turn. Finally, passing prediction early-stopping parameters
        to the wrapper must change its probabilities.
        """
        iris = load_iris()
        train_x, valid_x, train_y, _ = train_test_split(
            iris.data, iris.target, test_size=0.2, random_state=42)

        engine_params = {'objective': 'multiclass',
                         'num_class': 3,
                         'verbose': -1}
        booster = lgb.train(engine_params, lgb.Dataset(train_x, train_y))
        wrapper = lgb.LGBMClassifier(verbose=-1).fit(train_x, train_y)

        # Tests same probabilities
        engine_out = booster.predict(valid_x)
        wrapper_out = wrapper.predict_proba(valid_x)
        np.testing.assert_allclose(engine_out, wrapper_out)

        # Tests same predictions
        engine_out = np.argmax(booster.predict(valid_x), axis=1)
        wrapper_out = wrapper.predict(valid_x)
        np.testing.assert_equal(engine_out, wrapper_out)

        # Tests same raw scores
        engine_out = booster.predict(valid_x, raw_score=True)
        wrapper_out = wrapper.predict(valid_x, raw_score=True)
        np.testing.assert_allclose(engine_out, wrapper_out)

        # Tests same leaf indices
        engine_out = booster.predict(valid_x, pred_leaf=True)
        wrapper_out = wrapper.predict(valid_x, pred_leaf=True)
        np.testing.assert_equal(engine_out, wrapper_out)

        # Tests same feature contributions
        engine_out = booster.predict(valid_x, pred_contrib=True)
        wrapper_out = wrapper.predict(valid_x, pred_contrib=True)
        np.testing.assert_allclose(engine_out, wrapper_out)

        # Tests other parameters for the prediction works
        engine_out = booster.predict(valid_x)
        wrapper_out_early_stop = wrapper.predict_proba(valid_x,
                                                       pred_early_stop=True,
                                                       pred_early_stop_margin=1.0)
        # The outputs are expected to differ, so the comparison itself must fail.
        with self.assertRaises(AssertionError):
            np.testing.assert_allclose(engine_out, wrapper_out_early_stop)