"git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "866a2f91516f532899729bd22f0626df17558e34"
sklearn.py 12.8 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import

import numpy as np
from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from .engine import train
# sklearn
try:
    from sklearn.base import BaseEstimator
    from sklearn.base import RegressorMixin, ClassifierMixin
    from sklearn.preprocessing import LabelEncoder
    SKLEARN_INSTALLED = True
    LGBMModelBase = BaseEstimator
    LGBMRegressorBase = RegressorMixin
    LGBMClassifierBase = ClassifierMixin
    LGBMLabelEncoder = LabelEncoder
except ImportError:
    SKLEARN_INSTALLED = False
    LGBMModelBase = object
    LGBMClassifierBase = object
    LGBMRegressorBase = object
    LGBMLabelEncoder = None

def _objective_decorator(func):
    """Adapt a sklearn-style objective to the LightGBM ``fobj`` convention.

    The wrapped callable pulls the labels out of the training dataset and
    forwards them, together with the predictions, to ``func`` in the
    ``(y_true, y_pred)`` argument order.

    Note: for multi-class task, the label/pred is grouped by class_id first,
          then by row_id. To get the i-th row label/pred in the j-th class,
          access label/pred[j * num_data + i]; grad and hess must be grouped
          in the same way.

    Parameters
    ----------
    func : callable
        Expects a callable with signature ``func(y_true, y_pred)``:

        y_true : array_like of shape [n_samples]
            The target values
        y_pred : array_like of shape [n_samples]
            The predicted values

    Returns
    -------
    new_func : callable
        The new objective function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

        preds : array_like, shape [n_samples]
            The predicted values
        dataset : ``dataset``
            The training set from which the labels will be extracted using
            ``dataset.get_label()``
    """
    def inner(preds, dataset):
        """Fetch labels from ``dataset`` and delegate to ``func``."""
        return func(dataset.get_label(), preds)

    return inner

class LGBMModel(LGBMModelBase):
    """Implementation of the Scikit-Learn API for LightGBM.

    Parameters
    ----------
    num_leaves : int
        Maximum tree leaves for base learners.
    max_depth : int
        Maximum tree depth for base learners, -1 means no limit.
    learning_rate : float
        Boosting learning rate.
    n_estimators : int
        Number of boosted trees to fit.
    max_bin : int
        Number of bucketed bins for feature values.
    silent : boolean
        Whether to suppress messages while running boosting.
    objective : string or callable
        Specify the learning task and the corresponding learning objective or
        a custom objective function to be used (see note below).
    nthread : int
        Number of parallel threads; values <= 0 leave the choice to LightGBM.
    gamma : float
        Minimum loss reduction required to make a further partition on a leaf node of the tree.
    min_child_weight : int
        Minimum sum of instance weight(hessian) needed in a child(leaf).
    min_data : int
        Minimum number of data needed in a child(leaf).
    subsample : float
        Subsample ratio of the training instance.
    subsample_freq : int
        Frequency of subsample, <=0 means disabled.
    colsample_bytree : float
        Subsample ratio of columns when constructing each tree.
    colsample_byleaf : float
        Subsample ratio of columns when constructing each leaf.
    reg_alpha : float
        L1 regularization term on weights.
    reg_lambda : float
        L2 regularization term on weights.
    scale_pos_weight : float
        Balancing of positive and negative weights.
    is_unbalance : bool
        Whether the data is unbalanced (binary classification).
    seed : int
        Random number seed.

    Note
    ----
    A custom objective function can be provided for the ``objective``
    parameter. In this case, it should have the signature
    ``objective(y_true, y_pred) -> grad, hess``:

    y_true: array_like of shape [n_samples]
        The target values
    y_pred: array_like of shape [n_samples]
        The predicted values

    grad: array_like of shape [n_samples]
        The value of the gradient for each sample point.
    hess: array_like of shape [n_samples]
        The value of the second derivative for each sample point

    for multi-class task, the label/pred is grouped by class_id first, then by row_id
          if you want to get i-th row label/pred in j-th class, the access way is label/pred[j*num_data+i]
          and you should group grad and hess in this way as well
    """

    def __init__(self, num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 silent=True, objective="regression",
                 nthread=-1, gamma=0, min_child_weight=5, min_data=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1, colsample_byleaf=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0):
        """Store the hyper-parameters; raises LightGBMError if sklearn is missing."""
        if not SKLEARN_INSTALLED:
            raise LightGBMError('sklearn needs to be installed in order to use this module')

        self.num_leaves = num_leaves
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_bin = max_bin
        self.silent = silent
        self.objective = objective
        self.nthread = nthread
        self.gamma = gamma
        self.min_child_weight = min_child_weight
        self.min_data = min_data
        self.subsample = subsample
        self.subsample_freq = subsample_freq
        self.colsample_bytree = colsample_bytree
        self.colsample_byleaf = colsample_byleaf
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.scale_pos_weight = scale_pos_weight
        self.is_unbalance = is_unbalance
        self.seed = seed
        # Populated by fit(); None means the model has not been trained yet.
        self._Booster = None

    def booster(self):
        """Get the underlying lightgbm Booster of this model.

        This will raise an exception when fit was not called

        Returns
        -------
        booster : a lightgbm booster of underlying model
        """
        if self._Booster is None:
            raise LightGBMError('need to call fit beforehand')
        return self._Booster

    def get_params(self, deep=False):
        """Get parameters.

        Returns the parameters reported by the base estimator, plus a
        ``verbose`` entry derived from ``silent`` (0 when silent, else 1).
        ``nthread`` is dropped when it is <= 0.
        """
        params = super(LGBMModel, self).get_params(deep=deep)
        params['verbose'] = 0 if self.silent else 1
        if self.nthread <= 0:
            params.pop('nthread', None)
        return params

    def fit(self, X, y, eval_set=None, eval_metric=None,
            early_stopping_rounds=None, verbose=True, train_fields=None, valid_fields=None, other_params=None):
        """
        Fit the gradient boosting model

        Parameters
        ----------
        X : array_like
            Feature matrix
        y : array_like
            Labels
        eval_set : list, optional
            A list of (X, y) tuple pairs to use as a validation set for early-stopping
        eval_metric : str, list of str, callable, optional
            If a str, should be a built-in evaluation metric to use. See
            doc/parameter.md. If callable, a custom evaluation metric. The call
            signature is func(y_predicted, y_true) where y_true will be a
            Dataset object such that you may need to call the get_label
            method. And it must return (eval_name, eval_result, is_bigger_better)
        early_stopping_rounds : int
            Forwarded to ``train``. When not None, ``best_iteration`` is
            recorded on the estimator after fitting.
        verbose : bool
            Forwarded to ``train`` as ``verbose_eval``. If `verbose` and an
            evaluation set is used, the evaluation results are reported
            during training.
        train_fields : dict
            other data file in training data. e.g. train_fields['weight'] is weight data
            support fields: weight, group, init_score
        valid_fields : dict
            other data file in validation data. e.g. valid_fields[0]['weight'] is weight data for first valid data
            support fields: weight, group, init_score
        other_params: dict
            other parameters; merged over the estimator parameters before
            the objective and metric are set.

        Returns
        -------
        self
        """
        evals_result = {}
        params = self.get_params()

        if other_params is not None:
            params.update(other_params)

        # A callable objective is handed to ``train`` as ``fobj`` and the
        # built-in objective is switched off with the string "None".
        if callable(self.objective):
            fobj = _objective_decorator(self.objective)
            params["objective"] = "None"
        else:
            fobj = None
            params["objective"] = self.objective

        # A callable metric goes through ``feval``; named metrics go into
        # the parameter dict instead.
        if callable(eval_metric):
            feval = eval_metric
        else:
            feval = None
            if is_str(eval_metric) or isinstance(eval_metric, list):
                params['metric'] = eval_metric

        self._Booster = train(params, (X, y),
                              self.n_estimators, valid_datas=eval_set,
                              early_stopping_rounds=early_stopping_rounds,
                              evals_result=evals_result, fobj=fobj, feval=feval,
                              verbose_eval=verbose, train_fields=train_fields, valid_fields=valid_fields)

        # Only record results when training actually produced some.
        if evals_result:
            self.evals_result_ = evals_result

        if early_stopping_rounds is not None:
            self.best_iteration = self._Booster.best_iteration
        return self

    def predict(self, data, raw_score=False, num_iteration=0):
        """Return the booster's prediction for ``data``.

        Parameters
        ----------
        data : array_like
            Input features matrix.
        raw_score : bool
            Forwarded to ``Booster.predict``.
        num_iteration : int
            Forwarded to ``Booster.predict``.

        Raises
        ------
        LightGBMError
            If ``fit`` has not been called.
        """
        return self.booster().predict(data,
                                      raw_score=raw_score,
                                      num_iteration=num_iteration)

    def apply(self, X, num_iteration=0):
        """Return the predicted leaf every tree for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        num_iteration : int
            Limit number of trees in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        X_leaves : array_like, shape=[n_samples, n_trees]
        """
        return self.booster().predict(X,
                                      pred_leaf=True,
                                      num_iteration=num_iteration)

    def evals_result(self):
        """Return the evaluation results.

        Returns
        -------
        evals_result : dictionary

        Raises
        ------
        LightGBMError
            If no evaluation results were recorded, e.g. ``fit`` was not
            called or was called without an evaluation set.
        """
        # ``evals_result_`` only exists after a fit that produced results;
        # getattr keeps the failure a LightGBMError rather than AttributeError.
        results = getattr(self, 'evals_result_', None)
        if not results:
            raise LightGBMError('No results.')
        return results


class LGBMRegressor(LGBMModel, LGBMRegressorBase):
    # Reuse everything after the summary line and the blank line of the
    # LGBMModel docstring so the parameter documentation lives in one place.
    __doc__ = (
        "Implementation of the scikit-learn API for LightGBM regression.\n"
        "    "
        + LGBMModel.__doc__.split('\n', 2)[2]
    )

class LGBMClassifier(LGBMModel, LGBMClassifierBase):
    __doc__ = """Implementation of the scikit-learn API for LightGBM classification.

    """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])

    def fit(self, X, y, eval_set=None, eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            train_fields=None, valid_fields=None, other_params=None):
        """Fit the classifier on label-encoded targets.

        Records ``classes_`` and ``n_classes_``, encodes ``y`` (and the
        labels of every ``eval_set`` pair) with a label encoder, and then
        delegates to ``LGBMModel.fit``. Unless ``objective`` is callable it
        is overwritten with "multiclass" (more than two classes) or
        "binary".

        Parameters are the same as for ``LGBMModel.fit``. ``other_params``
        is copied, so the caller's dict is never modified.

        Returns
        -------
        self
        """
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)

        # Work on a private copy: ``num_class`` may be injected below and
        # must not leak into the dict the caller passed in.
        other_params = dict(other_params) if other_params is not None else {}

        if self.n_classes_ > 2:
            # Switch to using a multiclass objective in the underlying LGBM instance
            if not callable(self.objective):
                self.objective = "multiclass"
            other_params['num_class'] = self.n_classes_
        elif not callable(self.objective):
            self.objective = "binary"

        self._le = LGBMLabelEncoder().fit(y)
        training_labels = self._le.transform(y)

        if eval_set is not None:
            # Validation labels must go through the same encoding as y.
            eval_set = [(pair[0], self._le.transform(pair[1])) for pair in eval_set]

        super(LGBMClassifier, self).fit(X, training_labels,
                                        eval_set=eval_set,
                                        eval_metric=eval_metric,
                                        early_stopping_rounds=early_stopping_rounds,
                                        verbose=verbose,
                                        train_fields=train_fields,
                                        valid_fields=valid_fields,
                                        other_params=other_params)
        return self

    def predict(self, data, raw_score=False, num_iteration=0):
        """Predict class labels for ``data``.

        A 2-D booster output is reduced with an argmax over axis 1. A 1-D
        output is treated as the score of the second encoded class and
        thresholded. The resulting indices are mapped back to the original
        labels with the fitted label encoder.

        Parameters
        ----------
        data : array_like
            Input features matrix.
        raw_score : bool
            Forwarded to ``Booster.predict``; also selects the threshold
            used for 1-D output.
        num_iteration : int
            Forwarded to ``Booster.predict``.

        Returns
        -------
        Labels in the original label space.
        """
        scores = self.booster().predict(data,
                                        raw_score=raw_score,
                                        num_iteration=num_iteration)
        if len(scores.shape) > 1:
            class_index = np.argmax(scores, axis=1)
        else:
            # A probability of 0.5 corresponds to a raw margin of 0, so the
            # cut-off has to follow the kind of score that was requested.
            threshold = 0.0 if raw_score else 0.5
            class_index = (scores > threshold).astype(int)
        return self._le.inverse_transform(class_index)

    def predict_proba(self, data, raw_score=False, num_iteration=0):
        """Predict per-class scores for ``data``.

        With more than two classes the booster output is returned as is.
        Otherwise the output is taken as the score of the second encoded
        class and a two-column array ``[1 - score, score]`` is returned.

        NOTE(review): with ``raw_score=True`` the booster output is not a
        probability, so the ``1 - score`` column is not meaningful in the
        two-class case -- confirm the intended behaviour with callers.
        """
        class_probs = self.booster().predict(data,
                                             raw_score=raw_score,
                                             num_iteration=num_iteration)
        if self.n_classes_ > 2:
            return class_probs
        classone_probs = class_probs
        classzero_probs = 1.0 - classone_probs
        return np.vstack((classzero_probs, classone_probs)).transpose()