test_sklearn.py 13.9 KB
Newer Older
wxchan's avatar
wxchan committed
1
2
# coding: utf-8
# pylint: skip-file
3
import math
4
import os
wxchan's avatar
wxchan committed
5
6
import unittest

Guolin Ke's avatar
Guolin Ke committed
7
import lightgbm as lgb
wxchan's avatar
wxchan committed
8
import numpy as np
9
from sklearn import __version__ as sk_version
wxchan's avatar
wxchan committed
10
from sklearn.base import clone
wxchan's avatar
wxchan committed
11
from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
12
                              load_iris, load_svmlight_file)
13
from sklearn.exceptions import SkipTestWarning
wxchan's avatar
wxchan committed
14
from sklearn.externals import joblib
wxchan's avatar
wxchan committed
15
16
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
17
from sklearn.utils.estimator_checks import (_yield_all_checks, SkipTest,
18
                                            check_parameters_default_constructible)
wxchan's avatar
wxchan committed
19

wxchan's avatar
wxchan committed
20

21
22
23
24
25
26
def multi_error(y_true, y_pred):
    return np.mean(y_true != y_pred)


def multi_logloss(y_true, y_pred):
    return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)])
wxchan's avatar
wxchan committed
27

wxchan's avatar
wxchan committed
28
29
30
31

class TestSklearn(unittest.TestCase):

    def test_binary(self):
32
33
34
        X, y = load_breast_cancer(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        gbm = lgb.LGBMClassifier(n_estimators=50, silent=True)
35
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
36
        ret = log_loss(y_test, gbm.predict_proba(X_test))
wxchan's avatar
wxchan committed
37
        self.assertLess(ret, 0.15)
38
        self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['binary_logloss'][gbm.best_iteration_ - 1], places=5)
wxchan's avatar
wxchan committed
39

40
    def test_regression(self):
41
42
43
        X, y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        gbm = lgb.LGBMRegressor(n_estimators=50, silent=True)
44
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
45
46
        ret = mean_squared_error(y_test, gbm.predict(X_test))
        self.assertLess(ret, 16)
47
        self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1], places=5)
wxchan's avatar
wxchan committed
48

wxchan's avatar
wxchan committed
49
    def test_multiclass(self):
50
51
52
        X, y = load_digits(10, True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        gbm = lgb.LGBMClassifier(n_estimators=50, silent=True)
53
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
54
        ret = multi_error(y_test, gbm.predict(X_test))
wxchan's avatar
wxchan committed
55
        self.assertLess(ret, 0.2)
56
        ret = multi_logloss(y_test, gbm.predict_proba(X_test))
57
        self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['multi_logloss'][gbm.best_iteration_ - 1], places=5)
wxchan's avatar
wxchan committed
58

wxchan's avatar
wxchan committed
59
    def test_lambdarank(self):
60
61
62
63
        X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train'))
        X_test, y_test = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.test'))
        q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train.query'))
        q_test = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.test.query'))
64
65
        gbm = lgb.LGBMRanker()
        gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)],
66
                eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=5, verbose=False,
67
                callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)])
wxchan's avatar
wxchan committed
68
69
70
71
72
73

    def test_regression_with_custom_objective(self):
        def objective_ls(y_true, y_pred):
            grad = (y_pred - y_true)
            hess = np.ones(len(y_true))
            return grad, hess
74
75
76
        X, y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        gbm = lgb.LGBMRegressor(n_estimators=50, silent=True, objective=objective_ls)
77
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
78
        ret = mean_squared_error(y_test, gbm.predict(X_test))
wxchan's avatar
wxchan committed
79
        self.assertLess(ret, 100)
80
        self.assertAlmostEqual(ret, gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1], places=5)
wxchan's avatar
wxchan committed
81
82
83
84
85
86
87

    def test_binary_classification_with_custom_objective(self):
        def logregobj(y_true, y_pred):
            y_pred = 1.0 / (1.0 + np.exp(-y_pred))
            grad = y_pred - y_true
            hess = y_pred * (1.0 - y_pred)
            return grad, hess
88
        X, y = load_digits(2, True)
wxchan's avatar
wxchan committed
89

wxchan's avatar
wxchan committed
90
91
        def binary_error(y_test, y_pred):
            return np.mean([int(p > 0.5) != y for y, p in zip(y_test, y_pred)])
92
93
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        gbm = lgb.LGBMClassifier(n_estimators=50, silent=True, objective=logregobj)
94
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
95
        ret = binary_error(y_test, gbm.predict(X_test))
wxchan's avatar
wxchan committed
96
97
        self.assertLess(ret, 0.1)

98
    def test_dart(self):
99
100
        X, y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
101
102
103
104
        gbm = lgb.LGBMRegressor(boosting_type='dart')
        gbm.fit(X_train, y_train)
        self.assertLessEqual(gbm.score(X_train, y_train), 1.)

wxchan's avatar
wxchan committed
105
    def test_grid_search(self):
106
107
        X, y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
108
        params = {'boosting_type': ['dart', 'gbdt'],
wxchan's avatar
wxchan committed
109
110
                  'n_estimators': [5, 8],
                  'drop_rate': [0.05, 0.1]}
111
        gbm = GridSearchCV(lgb.LGBMRegressor(), params, cv=3)
wxchan's avatar
wxchan committed
112
        gbm.fit(X_train, y_train)
wxchan's avatar
wxchan committed
113
        self.assertIn(gbm.best_params_['n_estimators'], [5, 8])
wxchan's avatar
wxchan committed
114

115
    def test_clone_and_property(self):
116
117
118
119
120
        X, y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        gbm = lgb.LGBMRegressor(n_estimators=100, silent=True)
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)

wxchan's avatar
wxchan committed
121
        gbm_clone = clone(gbm)
122
        self.assertIsInstance(gbm.booster_, lgb.Booster)
123
        self.assertIsInstance(gbm.feature_importances_, np.ndarray)
124
125
126
127
128

        X, y = load_digits(2, True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        clf = lgb.LGBMClassifier()
        clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
129
130
131
        self.assertListEqual(sorted(clf.classes_), [0, 1])
        self.assertEqual(clf.n_classes_, 2)
        self.assertIsInstance(clf.booster_, lgb.Booster)
132
        self.assertIsInstance(clf.feature_importances_, np.ndarray)
wxchan's avatar
wxchan committed
133

wxchan's avatar
wxchan committed
134
    def test_joblib(self):
135
136
137
138
139
        X, y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
        gbm = lgb.LGBMRegressor(n_estimators=100, silent=True)
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)

wxchan's avatar
wxchan committed
140
141
        joblib.dump(gbm, 'lgb.pkl')
        gbm_pickle = joblib.load('lgb.pkl')
142
        self.assertIsInstance(gbm_pickle.booster_, lgb.Booster)
wxchan's avatar
wxchan committed
143
        self.assertDictEqual(gbm.get_params(), gbm_pickle.get_params())
144
        self.assertListEqual(list(gbm.feature_importances_), list(gbm_pickle.feature_importances_))
145
146
147

        X, y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
wxchan's avatar
wxchan committed
148
149
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
        gbm_pickle.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
wxchan's avatar
wxchan committed
150
151
152
        for key in gbm.evals_result_:
            for evals in zip(gbm.evals_result_[key], gbm_pickle.evals_result_[key]):
                self.assertAlmostEqual(*evals, places=5)
wxchan's avatar
wxchan committed
153
154
155
156
157
        pred_origin = gbm.predict(X_test)
        pred_pickle = gbm_pickle.predict(X_test)
        self.assertEqual(len(pred_origin), len(pred_pickle))
        for preds in zip(pred_origin, pred_pickle):
            self.assertAlmostEqual(*preds, places=5)
158
159
160
161
162
163
164

    def test_feature_importances_single_leaf(self):
        clf = lgb.LGBMClassifier(n_estimators=100)
        data = load_iris()
        clf.fit(data.data, data.target)
        importances = clf.feature_importances_
        self.assertEqual(len(importances), 4)
165

166
167
168
169
170
171
172
173
174
175
176
177
178
    def test_feature_importances_type(self):
        clf = lgb.LGBMClassifier(n_estimators=100)
        data = load_iris()
        clf.fit(data.data, data.target)
        clf.set_params(importance_type='split')
        importances_split = clf.feature_importances_
        clf.set_params(importance_type='gain')
        importances_gain = clf.feature_importances_
        # Test that the largest element is NOT the same, the smallest can be the same, i.e. zero
        importance_split_top1 = sorted(importances_split, reverse=True)[0]
        importance_gain_top1 = sorted(importances_gain, reverse=True)[0]
        self.assertNotEqual(importance_split_top1, importance_gain_top1)

179
    # sklearn <0.19 cannot accept instance, but many tests could be passed only with min_data=1 and min_data_in_bin=1
180
    @unittest.skipIf(sk_version < '0.19.0', 'scikit-learn version is less than 0.19')
181
    def test_sklearn_integration(self):
182
183
184
185
186
187
188
        # we cannot use `check_estimator` directly since there is no skip test mechanism
        for name, estimator in ((lgb.sklearn.LGBMClassifier.__name__, lgb.sklearn.LGBMClassifier),
                                (lgb.sklearn.LGBMRegressor.__name__, lgb.sklearn.LGBMRegressor)):
            check_parameters_default_constructible(name, estimator)
            # we cannot leave default params (see https://github.com/Microsoft/LightGBM/issues/833)
            estimator = estimator(min_child_samples=1, min_data_in_bin=1)
            for check in _yield_all_checks(name, estimator):
189
190
                check_name = check.func.__name__ if hasattr(check, 'func') else check.__name__
                if check_name == 'check_estimators_nan_inf':
191
192
193
194
195
196
197
                    continue  # skip test because LightGBM deals with nan
                try:
                    check(name, estimator)
                except SkipTest as message:
                    warnings.warn(message, SkipTestWarning)

    @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
198
    def test_pandas_categorical(self):
199
        import pandas as pd
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
        X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75),  # str
                          "B": np.random.permutation([1, 2, 3] * 100),  # int
                          "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60),  # float
                          "D": np.random.permutation([True, False] * 150)})  # bool
        y = np.random.permutation([0, 1] * 150)
        X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),
                               "B": np.random.permutation([1, 3] * 30),
                               "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
                               "D": np.random.permutation([True, False] * 30)})
        for col in ["A", "B", "C", "D"]:
            X[col] = X[col].astype('category')
            X_test[col] = X_test[col].astype('category')
        gbm0 = lgb.sklearn.LGBMClassifier().fit(X, y)
        pred0 = list(gbm0.predict(X_test))
        gbm1 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=[0])
        pred1 = list(gbm1.predict(X_test))
        gbm2 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A'])
        pred2 = list(gbm2.predict(X_test))
        gbm3 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A', 'B', 'C', 'D'])
        pred3 = list(gbm3.predict(X_test))
        gbm3.booster_.save_model('categorical.model')
        gbm4 = lgb.Booster(model_file='categorical.model')
        pred4 = list(gbm4.predict(X_test))
        pred_prob = list(gbm0.predict_proba(X_test)[:, 1])
        np.testing.assert_almost_equal(pred0, pred1)
        np.testing.assert_almost_equal(pred0, pred2)
        np.testing.assert_almost_equal(pred0, pred3)
        np.testing.assert_almost_equal(pred_prob, pred4)
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272

    def test_predict(self):
        iris = load_iris()
        X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                            test_size=0.2, random_state=42)

        gbm = lgb.train({'objective': 'multiclass',
                         'num_class': 3,
                         'verbose': -1},
                        lgb.Dataset(X_train, y_train))
        clf = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train)

        # Tests same probabilities
        res_engine = gbm.predict(X_test)
        res_sklearn = clf.predict_proba(X_test)
        np.testing.assert_allclose(res_engine, res_sklearn)

        # Tests same predictions
        res_engine = np.argmax(gbm.predict(X_test), axis=1)
        res_sklearn = clf.predict(X_test)
        np.testing.assert_equal(res_engine, res_sklearn)

        # Tests same raw scores
        res_engine = gbm.predict(X_test, raw_score=True)
        res_sklearn = clf.predict(X_test, raw_score=True)
        np.testing.assert_allclose(res_engine, res_sklearn)

        # Tests same leaf indices
        res_engine = gbm.predict(X_test, pred_leaf=True)
        res_sklearn = clf.predict(X_test, pred_leaf=True)
        np.testing.assert_equal(res_engine, res_sklearn)

        # Tests same feature contributions
        res_engine = gbm.predict(X_test, pred_contrib=True)
        res_sklearn = clf.predict(X_test, pred_contrib=True)
        np.testing.assert_allclose(res_engine, res_sklearn)

        # Tests other parameters for the prediction works
        res_engine = gbm.predict(X_test)
        res_sklearn_params = clf.predict_proba(X_test,
                                               pred_early_stop=True,
                                               pred_early_stop_margin=1.0)
        self.assertRaises(AssertionError,
                          np.testing.assert_allclose,
                          res_engine, res_sklearn_params)