test_sklearn.py 8.88 KB
Newer Older
wxchan's avatar
wxchan committed
1
2
# coding: utf-8
# pylint: skip-file
3
import math
4
import os
wxchan's avatar
wxchan committed
5
import unittest
6
import warnings
wxchan's avatar
wxchan committed
7

Guolin Ke's avatar
Guolin Ke committed
8
import lightgbm as lgb
wxchan's avatar
wxchan committed
9
import numpy as np
wxchan's avatar
wxchan committed
10
from sklearn.base import clone
wxchan's avatar
wxchan committed
11
from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
12
                              load_iris, load_svmlight_file)
wxchan's avatar
wxchan committed
13
from sklearn.externals import joblib
wxchan's avatar
wxchan committed
14
15
16
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split

wxchan's avatar
wxchan committed
17

18
19
20
21
22
23
def multi_error(y_true, y_pred):
    """Return the fraction of predictions that differ from the true labels.

    Both inputs are converted to arrays before comparing, so plain Python
    sequences are compared element by element instead of as whole objects
    (``[1, 2] != [1, 3]`` is a single ``True``, not a per-sample result).

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels, one per entry of ``y_true``.

    Returns
    -------
    float
        Mean of the element-wise ``y_true != y_pred`` comparison.
    """
    return np.mean(np.asarray(y_true) != np.asarray(y_pred))


def multi_logloss(y_true, y_pred):
    """Return the mean negative log of the value predicted for each true label.

    For sample ``i`` with label ``y_true[i]``, the value read is
    ``y_pred[i][y_true[i]]``; the negated natural logs of those values are
    averaged with ``np.mean``.
    """
    sample_losses = []
    for row, label in enumerate(y_true):
        sample_losses.append(-math.log(y_pred[row][label]))
    return np.mean(sample_losses)
wxchan's avatar
wxchan committed
24

wxchan's avatar
wxchan committed
25
26
27
28

class TestSklearn(unittest.TestCase):
    """Tests for LightGBM's scikit-learn style estimators.

    Exercises ``LGBMClassifier``, ``LGBMRegressor`` and ``LGBMRanker``:
    training with early stopping, custom objectives, DART boosting, grid
    search, cloning, joblib round-trips and the ``seed``/``random_state``
    backward-compatibility path.
    """

    @staticmethod
    def _split(X, y, test_size=0.1):
        """Return a reproducible ``(X_train, X_test, y_train, y_test)`` split."""
        return train_test_split(X, y, test_size=test_size, random_state=42)

    def test_binary(self):
        """Binary classifier: hold-out log loss is low and matches the recorded metric."""
        X, y = load_breast_cancer(return_X_y=True)
        X_train, X_test, y_train, y_test = self._split(X, y)
        gbm = lgb.LGBMClassifier(n_estimators=50, silent=True)
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
        ret = log_loss(y_test, gbm.predict_proba(X_test))
        self.assertLess(ret, 0.15)
        # The metric recorded at the best iteration must agree with the one
        # recomputed from the model's own predictions.
        self.assertAlmostEqual(ret, gbm.evals_result['valid_0']['binary_logloss'][gbm.best_iteration - 1], places=5)

    # NOTE: the method name keeps its historical spelling so that existing
    # test selectors continue to work.
    def test_regreesion(self):
        """Regressor: hold-out MSE is below 16 and matches the recorded ``l2`` metric."""
        X, y = load_boston(return_X_y=True)
        X_train, X_test, y_train, y_test = self._split(X, y)
        gbm = lgb.LGBMRegressor(n_estimators=50, silent=True)
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
        ret = mean_squared_error(y_test, gbm.predict(X_test))
        self.assertLess(ret, 16)
        self.assertAlmostEqual(ret, gbm.evals_result['valid_0']['l2'][gbm.best_iteration - 1], places=5)

    def test_multiclass(self):
        """Multiclass classifier: error rate is below 0.2 and log loss matches the recorded metric."""
        X, y = load_digits(n_class=10, return_X_y=True)
        X_train, X_test, y_train, y_test = self._split(X, y)
        gbm = lgb.LGBMClassifier(n_estimators=50, silent=True)
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
        ret = multi_error(y_test, gbm.predict(X_test))
        self.assertLess(ret, 0.2)
        ret = multi_logloss(y_test, gbm.predict_proba(X_test))
        self.assertAlmostEqual(ret, gbm.evals_result['valid_0']['multi_logloss'][gbm.best_iteration - 1], places=5)

    def test_lambdarank(self):
        """Ranker: fitting on the bundled lambdarank example with a decaying learning rate succeeds."""
        rank_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank')
        X_train, y_train = load_svmlight_file(os.path.join(rank_dir, 'rank.train'))
        X_test, y_test = load_svmlight_file(os.path.join(rank_dir, 'rank.test'))
        q_train = np.loadtxt(os.path.join(rank_dir, 'rank.train.query'))
        q_test = np.loadtxt(os.path.join(rank_dir, 'rank.test.query'))
        gbm = lgb.LGBMRanker()
        gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)],
                eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=5, verbose=False,
                callbacks=[lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1)])
        # Minimal check that training actually produced a model.
        self.assertIsInstance(gbm.booster_, lgb.Booster)

    def test_regression_with_custom_objective(self):
        """Regressor trained with a hand-written least-squares objective."""
        def objective_ls(y_true, y_pred):
            # Gradient and (constant) hessian of 0.5 * (y_pred - y_true) ** 2.
            grad = (y_pred - y_true)
            hess = np.ones(len(y_true))
            return grad, hess

        X, y = load_boston(return_X_y=True)
        X_train, X_test, y_train, y_test = self._split(X, y)
        gbm = lgb.LGBMRegressor(n_estimators=50, silent=True, objective=objective_ls)
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
        ret = mean_squared_error(y_test, gbm.predict(X_test))
        self.assertLess(ret, 100)
        self.assertAlmostEqual(ret, gbm.evals_result['valid_0']['l2'][gbm.best_iteration - 1], places=5)

    def test_binary_classification_with_custom_objective(self):
        """Classifier trained with a hand-written logistic objective."""
        def logregobj(y_true, y_pred):
            # Squash raw scores through a sigmoid, then return the gradient
            # and hessian of the logistic loss with respect to the raw score.
            y_pred = 1.0 / (1.0 + np.exp(-y_pred))
            grad = y_pred - y_true
            hess = y_pred * (1.0 - y_pred)
            return grad, hess

        def binary_error(y_test, y_pred):
            # Fraction of samples whose prediction, thresholded at 0.5,
            # differs from the label.
            return np.mean([int(p > 0.5) != y for y, p in zip(y_test, y_pred)])

        X, y = load_digits(n_class=2, return_X_y=True)
        X_train, X_test, y_train, y_test = self._split(X, y)
        gbm = lgb.LGBMClassifier(n_estimators=50, silent=True, objective=logregobj)
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=False)
        ret = binary_error(y_test, gbm.predict(X_test))
        self.assertLess(ret, 0.1)

    def test_dart(self):
        """Regressor with ``boosting_type='dart'`` trains and can be scored."""
        X, y = load_boston(return_X_y=True)
        X_train, _, y_train, _ = self._split(X, y)
        gbm = lgb.LGBMRegressor(boosting_type='dart')
        gbm.fit(X_train, y_train)
        # NOTE(review): if score() is R^2 this upper bound always holds, so the
        # test is effectively a smoke test -- consider adding a lower bound.
        self.assertLessEqual(gbm.score(X_train, y_train), 1.)

    def test_grid_search(self):
        """``GridSearchCV`` can search over LightGBM regressor parameters."""
        X, y = load_boston(return_X_y=True)
        X_train, _, y_train, _ = self._split(X, y)
        params = {'boosting_type': ['dart', 'gbdt'],
                  'n_estimators': [5, 8],
                  'drop_rate': [0.05, 0.1]}
        gbm = GridSearchCV(lgb.LGBMRegressor(), params, cv=3)
        gbm.fit(X_train, y_train)
        self.assertIn(gbm.best_params_['n_estimators'], [5, 8])

    def test_clone_and_property(self):
        """Fitted estimators can be cloned and expose their fitted properties."""
        X, y = load_boston(return_X_y=True)
        X_train, X_test, y_train, y_test = self._split(X, y)
        gbm = lgb.LGBMRegressor(n_estimators=100, silent=True)
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)

        # clone() must accept a fitted LightGBM estimator and hand back the same type.
        gbm_clone = clone(gbm)
        self.assertIsInstance(gbm_clone, lgb.LGBMRegressor)
        self.assertIsInstance(gbm.booster_, lgb.Booster)
        self.assertIsInstance(gbm.feature_importances_, np.ndarray)

        X, y = load_digits(n_class=2, return_X_y=True)
        X_train, X_test, y_train, y_test = self._split(X, y)
        clf = lgb.LGBMClassifier()
        clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
        self.assertListEqual(sorted(clf.classes_), [0, 1])
        self.assertEqual(clf.n_classes_, 2)
        self.assertIsInstance(clf.booster_, lgb.Booster)
        self.assertIsInstance(clf.feature_importances_, np.ndarray)

    def test_joblib(self):
        """A fitted regressor survives a joblib dump/load round-trip."""
        # Local imports: only this test needs them.
        import shutil
        import tempfile

        X, y = load_boston(return_X_y=True)
        X_train, X_test, y_train, y_test = self._split(X, y)
        gbm = lgb.LGBMRegressor(n_estimators=100, silent=True)
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)

        # Pickle into a throwaway directory so the test leaves nothing behind
        # in the working directory, even when an assertion fails.
        tmp_dir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, tmp_dir, ignore_errors=True)
        model_path = os.path.join(tmp_dir, 'lgb.pkl')
        joblib.dump(gbm, model_path)
        gbm_pickle = joblib.load(model_path)

        self.assertIsInstance(gbm_pickle.booster_, lgb.Booster)
        self.assertDictEqual(gbm.get_params(), gbm_pickle.get_params())
        self.assertListEqual(list(gbm.feature_importances_), list(gbm_pickle.feature_importances_))

        # Refit both copies on the same data; they must behave identically.
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
        gbm_pickle.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

        # Compare the recorded metric values themselves. Zipping the two
        # per-eval-set dicts directly would only pair up their metric names.
        for eval_name, metrics in gbm.evals_result_.items():
            for metric_name, values in metrics.items():
                pickle_values = gbm_pickle.evals_result_[eval_name][metric_name]
                self.assertEqual(len(values), len(pickle_values))
                for expected, actual in zip(values, pickle_values):
                    self.assertAlmostEqual(expected, actual, places=5)

        pred_origin = gbm.predict(X_test)
        pred_pickle = gbm_pickle.predict(X_test)
        self.assertEqual(len(pred_origin), len(pred_pickle))
        for preds in zip(pred_origin, pred_pickle):
            self.assertAlmostEqual(*preds, places=5)

    def test_feature_importances_single_leaf(self):
        """``feature_importances_`` has one entry per input feature on iris."""
        clf = lgb.LGBMClassifier(n_estimators=100)
        data = load_iris()
        clf.fit(data.data, data.target)
        importances = clf.feature_importances_
        self.assertEqual(len(importances), 4)

    def test_sklearn_backward_compatibility(self):
        """Legacy parameter names still work and emit warnings."""
        iris = load_iris()
        X_train, X_test, y_train, y_test = self._split(iris.data, iris.target, test_size=0.2)

        # Tests that `seed` is the same as `random_state`
        clf_1 = lgb.sklearn.LGBMClassifier(seed=42, subsample=0.6, colsample_bytree=0.8)
        clf_2 = lgb.sklearn.LGBMClassifier(random_state=42, subsample=0.6, colsample_bytree=0.8)
        y_pred_1 = clf_1.fit(X_train, y_train).predict_proba(X_test)
        y_pred_2 = clf_2.fit(X_train, y_train).predict_proba(X_test)
        np.testing.assert_allclose(y_pred_1, y_pred_2)

        # Tests that warnings were raised
        with warnings.catch_warnings(record=True) as w:
            clf_1.get_params()
            clf_2.set_params(nthread=-1).fit(X_train, y_train)
            self.assertEqual(len(w), 2)
            # NOTE(review): every warning category subclasses Warning, so this
            # check cannot fail; tighten it once the intended category is confirmed.
            self.assertTrue(issubclass(w[-1].category, Warning))