sklearn.py 16.6 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import

import numpy as np
from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from .engine import train
# sklearn
try:
    from sklearn.base import BaseEstimator
    from sklearn.base import RegressorMixin, ClassifierMixin
    from sklearn.preprocessing import LabelEncoder
    SKLEARN_INSTALLED = True
    LGBMModelBase = BaseEstimator
    LGBMRegressorBase = RegressorMixin
    LGBMClassifierBase = ClassifierMixin
    LGBMLabelEncoder = LabelEncoder
except ImportError:
    SKLEARN_INSTALLED = False
    LGBMModelBase = object
    LGBMClassifierBase = object
    LGBMRegressorBase = object
    LGBMLabelEncoder = None

Guolin Ke's avatar
Guolin Ke committed
24
def _point_wise_objective(func):
Guolin Ke's avatar
Guolin Ke committed
25
    """Decorate an objective function
Guolin Ke's avatar
Guolin Ke committed
26
27
    Note: for multi-class task, the y_pred is group by class_id first, then group by row_id
          if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
Guolin Ke's avatar
Guolin Ke committed
28
29
30
31
32
33
34
35
          and you should group grad and hess in this way as well
    Parameters
    ----------
    func: callable
        Expects a callable with signature ``func(y_true, y_pred)``:

        y_true: array_like of shape [n_samples]
            The target values
Guolin Ke's avatar
Guolin Ke committed
36
        y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
Guolin Ke's avatar
Guolin Ke committed
37
38
            The predicted values

Guolin Ke's avatar
Guolin Ke committed
39

Guolin Ke's avatar
Guolin Ke committed
40
41
42
43
44
45
    Returns
    -------
    new_func: callable
        The new objective function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

Guolin Ke's avatar
Guolin Ke committed
46
        preds: array_like, shape [n_samples] or shape[n_samples* n_class]
Guolin Ke's avatar
Guolin Ke committed
47
48
49
50
51
52
53
54
            The predicted values
        dataset: ``dataset``
            The training set from which the labels will be extracted using
            ``dataset.get_label()``
    """
    def inner(preds, dataset):
        """internal function"""
        labels = dataset.get_label()
Guolin Ke's avatar
Guolin Ke committed
55
56
57
58
59
60
61
62
63
64
65
        grad, hess = func(labels, preds)
        """weighted for objective"""
        weight = dataset.get_weight()
        if weight is not None:
            """only one class"""
            if len(weight) == len(grad):
                grad = np.multiply(grad, weight)
                hess = np.multiply(hess, weight)
            else:
                num_data = len(weight)
                num_class = len(grad) // num_data
Guolin Ke's avatar
Guolin Ke committed
66
                if num_class * num_data != len(grad):
Guolin Ke's avatar
Guolin Ke committed
67
                    raise ValueError("length of grad and hess should equal with num_class * num_data")
Guolin Ke's avatar
Guolin Ke committed
68
69
70
71
72
73
                for k in range(num_class):
                    for i in range(num_data):
                        idx = k * num_data + i
                        grad[idx] *= weight[i]
                        hess[idx] *= weight[i]
        return grad, hess
Guolin Ke's avatar
Guolin Ke committed
74
75
76
77
78
79
80
81
82
83
    return inner

class LGBMModel(LGBMModelBase):
    """Implementation of the Scikit-Learn API for LightGBM.

    Parameters
    ----------
    num_leaves : int
        Maximum tree leaves for base learners.
    max_depth : int
        Maximum tree depth for base learners, -1 means no limit.
    learning_rate : float
        Boosting learning rate.
    n_estimators : int
        Number of boosted trees to fit.
    max_bin : int
        Maximum number of discrete bins used to bucket feature values.
    silent : boolean
        Whether to print messages while running boosting.
    objective : string or callable
        Specify the learning task and the corresponding learning objective or
        a custom objective function to be used (see note below).
    nthread : int
        Number of parallel threads.
    min_split_gain : float
        Minimum loss reduction required to make a further partition on a leaf node of the tree.
    min_child_weight : int
        Minimum sum of instance weight (hessian) needed in a child (leaf).
    min_child_samples : int
        Minimum number of data needed in a child (leaf).
    subsample : float
        Subsample ratio of the training instance.
    subsample_freq : int
        Frequency of subsample, <=0 means no enable.
    colsample_bytree : float
        Subsample ratio of columns when constructing each tree.
    reg_alpha : float
        L1 regularization term on weights.
    reg_lambda : float
        L2 regularization term on weights.
    scale_pos_weight : float
        Balancing of positive and negative weights.
    is_unbalance : bool
        Is unbalance for binary classification.
    seed : int
        Random number seed.

    Note
    ----
    A custom objective function can be provided for the ``objective``
    parameter. In this case, it should have the signature
    ``objective(y_true, y_pred) -> grad, hess``:

    y_true: array_like of shape [n_samples]
        The target values
    y_pred: array_like of shape [n_samples] or shape [n_samples * n_class]
        The predicted values

    grad: array_like of shape [n_samples] or shape [n_samples * n_class]
        The value of the gradient for each sample point.
    hess: array_like of shape [n_samples] or shape [n_samples * n_class]
        The value of the second derivative for each sample point.

    For multi-class task, y_pred is grouped by class_id first, then by
    row_id: the i-th row's prediction for the j-th class is
    ``y_pred[j * num_data + i]``, and grad and hess must be grouped in the
    same way.
    """

    def __init__(self, num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 silent=True, objective="regression",
                 nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0):
        if not SKLEARN_INSTALLED:
            raise LightGBMError('sklearn needs to be installed in order to use this module')

        self.num_leaves = num_leaves
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_bin = max_bin
        self.silent = silent
        self.objective = objective
        self.nthread = nthread
        self.min_split_gain = min_split_gain
        self.min_child_weight = min_child_weight
        self.min_child_samples = min_child_samples
        self.subsample = subsample
        self.subsample_freq = subsample_freq
        self.colsample_bytree = colsample_bytree
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.scale_pos_weight = scale_pos_weight
        self.is_unbalance = is_unbalance
        self.seed = seed
        self._Booster = None
        # Fix: initialize so evals_result() raises LightGBMError ('No results.')
        # instead of AttributeError when fit() stored no evaluation results.
        self.evals_result_ = None
        # A callable objective is wrapped into the fobj form expected by
        # lightgbm.engine.train; a string objective is passed through params.
        if callable(self.objective):
            self.fobj = _point_wise_objective(self.objective)
        else:
            self.fobj = None

    def booster(self):
        """Get the underlying lightgbm Booster of this model.

        This will raise an exception when fit was not called.

        Returns
        -------
        booster : a lightgbm booster of underlying model
        """
        if self._Booster is None:
            raise LightGBMError('need to call fit beforehand')
        return self._Booster

    def get_params(self, deep=False):
        """Get parameters in the form consumed by the LightGBM backend.

        Maps ``silent`` onto the backend ``verbose`` flag, and drops a
        non-positive ``nthread`` so the backend default is used instead.
        """
        params = super(LGBMModel, self).get_params(deep=deep)
        params['verbose'] = 0 if self.silent else 1
        if self.nthread <= 0:
            params.pop('nthread', None)
        return params

    def fit(self, X, y, eval_set=None, eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            train_fields=None, valid_fields=None, other_params=None):
        """
        Fit the gradient boosting model.

        Parameters
        ----------
        X : array_like
            Feature matrix
        y : array_like
            Labels
        eval_set : list, optional
            A list of (X, y) tuple pairs to use as a validation set for early-stopping
        eval_metric : str, list of str, callable, optional
            If a str, should be a built-in evaluation metric to use.
            If callable, a custom evaluation metric. The call
            signature is func(y_predicted, dataset) where dataset will be a
            Dataset object such that you may need to call the get_label
            method. It must return (eval_name->str, eval_result->float, is_bigger_better->Bool)
        early_stopping_rounds : int
            When set, ``self.best_iteration`` is stored after training.
        verbose : bool
            If `verbose` and an evaluation set is used, writes the evaluation
            progress.
        train_fields : dict
            other data file in training data. e.g. train_fields['weight'] is weight data
            support fields: weight, group, init_score
        valid_fields : dict
            other data file in training data. e.g. valid_fields[0]['weight'] is weight data for first valid data
            support fields: weight, group, init_score
        other_params : dict
            other parameters

        Returns
        -------
        self : object
        """
        evals_result = {}
        params = self.get_params()

        if other_params is not None:
            params.update(other_params)

        # With a custom objective the backend objective is disabled ("None");
        # the wrapped callable is passed through fobj instead.
        if self.fobj:
            params["objective"] = "None"
        else:
            params["objective"] = self.objective

        # A callable eval_metric is passed through feval; a metric name (or a
        # list of names) is handed to the backend via the 'metric' parameter.
        if callable(eval_metric):
            feval = eval_metric
        else:
            feval = None
            if is_str(eval_metric) or isinstance(eval_metric, list):
                params.update({'metric': eval_metric})
        # Fix: removed the redundant `feval = eval_metric if callable(...)`
        # re-assignment that followed this branch in the original code.

        self._Booster = train(params, (X, y),
                              self.n_estimators, valid_datas=eval_set,
                              early_stopping_rounds=early_stopping_rounds,
                              evals_result=evals_result, fobj=self.fobj, feval=feval,
                              verbose_eval=verbose, train_fields=train_fields, valid_fields=valid_fields)

        if evals_result:
            # Fix: the original loop re-assigned each entry to itself before
            # storing; just keep the collected results.
            self.evals_result_ = evals_result

        if early_stopping_rounds is not None:
            self.best_iteration = self._Booster.best_iteration
        return self

    def predict(self, data, raw_score=False, num_iteration=0):
        """Return predictions of the fitted booster for ``data``.

        Parameters
        ----------
        data : array_like
            Feature matrix to predict on.
        raw_score : bool
            Whether to return the raw (untransformed) score.
        num_iteration : int
            Limit number of iterations used in the prediction; 0 uses all.
        """
        return self.booster().predict(data,
                                      raw_score=raw_score,
                                      num_iteration=num_iteration)

    def apply(self, X, num_iteration=0):
        """Return the predicted leaf every tree for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.
        num_iteration : int
            Limit number of trees in the prediction; defaults to 0 (use all
            trees).  (Fix: the original docstring documented this parameter
            under the wrong name ``ntree_limit``.)

        Returns
        -------
        X_leaves : array_like, shape=[n_samples, n_trees]
        """
        return self.booster().predict(X,
                                      pred_leaf=True,
                                      num_iteration=num_iteration)

    def evals_result(self):
        """Return the evaluation results stored by fit().

        Returns
        -------
        evals_result : dictionary

        Raises
        ------
        LightGBMError
            If no evaluation results are available.
        """
        if self.evals_result_:
            return self.evals_result_
        raise LightGBMError('No results.')


class LGBMRegressor(LGBMModel, LGBMRegressorBase):
    # Reuse the shared parameter documentation from LGBMModel, swapping in a
    # regression-specific summary (drops LGBMModel's first two doc lines).
    __doc__ = """Implementation of the scikit-learn API for LightGBM regression.
    """ + LGBMModel.__doc__.split('\n', 2)[2]

class LGBMClassifier(LGBMModel, LGBMClassifierBase):
    __doc__ = """Implementation of the scikit-learn API for LightGBM classification.

    """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])

    def fit(self, X, y, eval_set=None, eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            train_fields=None, valid_fields=None, other_params=None):
        """Fit the classifier.

        Labels are encoded with a LabelEncoder and the objective is selected
        automatically ("binary" or "multiclass") from the number of classes.
        See ``LGBMModel.fit`` for the description of the parameters.
        """
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        # Fix: copy other_params so the caller's dict is never mutated when
        # 'num_class' is injected below.
        other_params = {} if other_params is None else dict(other_params)
        if self.n_classes_ > 2:
            # Switch to using a multiclass objective in the underlying LGBM instance
            self.objective = "multiclass"
            other_params['num_class'] = self.n_classes_
        else:
            self.objective = "binary"

        self._le = LGBMLabelEncoder().fit(y)
        training_labels = self._le.transform(y)

        # Validation targets must be encoded the same way as the training targets.
        if eval_set is not None:
            eval_set = [(x[0], self._le.transform(x[1])) for x in eval_set]

        super(LGBMClassifier, self).fit(X, training_labels, eval_set,
                                        eval_metric, early_stopping_rounds,
                                        verbose, train_fields, valid_fields,
                                        other_params)
        return self

    def predict(self, data, raw_score=False, num_iteration=0):
        """Predict class labels, decoded back to the original label space."""
        class_probs = self.booster().predict(data,
                                             raw_score=raw_score,
                                             num_iteration=num_iteration)
        if len(class_probs.shape) > 1:
            # Multi-class: pick the class with the highest probability.
            column_indexes = np.argmax(class_probs, axis=1)
        else:
            # Binary: threshold the positive-class score at 0.5.
            column_indexes = np.repeat(0, class_probs.shape[0])
            column_indexes[class_probs > 0.5] = 1
        return self._le.inverse_transform(column_indexes)

    def predict_proba(self, data, raw_score=False, num_iteration=0):
        """Predict class probabilities, shape [n_samples, n_classes]."""
        class_probs = self.booster().predict(data,
                                             raw_score=raw_score,
                                             num_iteration=num_iteration)
        if self.n_classes_ > 2:
            return class_probs
        else:
            # Binary: the booster yields P(class 1); stack P(class 0) beside it.
            classone_probs = class_probs
            classzero_probs = 1.0 - classone_probs
            return np.vstack((classzero_probs, classone_probs)).transpose()

Guolin Ke's avatar
Guolin Ke committed
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410

def _group_wise_objective(func):
    """Decorate an objective function
    Parameters
    ----------
    func: callable
        Expects a callable with signature ``func(y_true, group, y_pred)``:

        y_true: array_like of shape [n_samples]
            The target values
        group : array_like of shape
            group size data of data
        y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
            The predicted values
    Returns
    -------
    new_func: callable
        The new objective function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

        preds: array_like, shape [n_samples] or shape[n_samples* n_class]
            The predicted values
        dataset: ``dataset``
            The training set from which the labels will be extracted using
            ``dataset.get_label()``
    """
    def inner(preds, dataset):
        """internal function"""
        labels = dataset.get_label()
        group = dataset.get_group()
        if group is None:
            raise ValueError("group should not be None for ranking task")
        grad, hess = func(labels, group, preds)
        """weighted for objective"""
        weight = dataset.get_weight()
        if weight is not None:
            """only one class"""
            if len(weight) == len(grad):
                grad = np.multiply(grad, weight)
                hess = np.multiply(hess, weight)
            else:
                raise ValueError("lenght of grad and hess should equal with num_data")
        return grad, hess
    return inner

class LGBMRanker(LGBMModel):
    __doc__ = """Implementation of the scikit-learn API for LightGBM ranking application.

    """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])

    def fit(self, X, y, eval_set=None, eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            train_fields=None, valid_fields=None, other_params=None):
        """Fit the ranker.

        Requires group information: ``train_fields['group']`` must be set,
        and when ``eval_set`` is given, each entry of ``valid_fields`` must
        also carry a 'group' field. See ``LGBMModel.fit`` for the other
        parameters.
        """
        # Check group data.
        # Fix: also reject train_fields=None explicitly instead of failing
        # with a TypeError on the membership test.
        if train_fields is None or "group" not in train_fields:
            raise ValueError("should set group in train_fields for ranking task")

        if eval_set is not None:
            if valid_fields is None:
                raise ValueError("valid_fields cannot be None when eval_set is not None")
            elif len(valid_fields) != len(eval_set):
                # Fix: corrected the "lenght" typo in the error message.
                raise ValueError("length of valid_fields should equal with eval_set")
            else:
                for fields in valid_fields:
                    if "group" not in fields:
                        raise ValueError("should set group in valid_fields for ranking task")

        # A callable objective is wrapped into the group-wise form; otherwise
        # fall back to the built-in lambdarank objective.
        if callable(self.objective):
            self.fobj = _group_wise_objective(self.objective)
        else:
            self.objective = "lambdarank"
            self.fobj = None

        super(LGBMRanker, self).fit(X, y, eval_set, eval_metric,
                                    early_stopping_rounds, verbose,
                                    train_fields, valid_fields,
                                    other_params)
        return self