# coding: utf-8
# pylint: disable = invalid-name, W0105, C0111
"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import

import numpy as np
from .basic import LightGBMError, Dataset, is_str
from .engine import train
# Optional scikit-learn integration.
# When scikit-learn can be imported, the wrapper classes below inherit from its
# estimator base class / mixins; otherwise the aliases fall back to ``object``
# (and ``None`` for the label encoder) so that this module still imports.
# ``SKLEARN_INSTALLED`` records which branch was taken.
try:
    from sklearn.base import BaseEstimator
    from sklearn.base import RegressorMixin, ClassifierMixin
    from sklearn.preprocessing import LabelEncoder
    SKLEARN_INSTALLED = True
    LGBMModelBase = BaseEstimator
    LGBMRegressorBase = RegressorMixin
    LGBMClassifierBase = ClassifierMixin
    LGBMLabelEncoder = LabelEncoder
except ImportError:
    # scikit-learn is missing: keep the aliases defined so class statements
    # that reference them do not fail at import time.
    SKLEARN_INSTALLED = False
    LGBMModelBase = object
    LGBMClassifierBase = object
    LGBMRegressorBase = object
    LGBMLabelEncoder = None

def _point_wise_objective(func):
    """Decorate an objective function
    Note: for multi-class task, the y_pred is group by class_id first, then group by row_id
          if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
          and you should group grad and hess in this way as well
    Parameters
    ----------
    func: callable
        Expects a callable with signature ``func(y_true, y_pred)``:

        y_true: array_like of shape [n_samples]
            The target values
        y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
            The predicted values


    Returns
    -------
    new_func: callable
        The new objective function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

        preds: array_like, shape [n_samples] or shape[n_samples* n_class]
            The predicted values
        dataset: ``dataset``
            The training set from which the labels will be extracted using
            ``dataset.get_label()``
    """
    def inner(preds, dataset):
        """internal function"""
        labels = dataset.get_label()
        grad, hess = func(labels, preds)
        """weighted for objective"""
        weight = dataset.get_weight()
        if weight is not None:
            """only one class"""
            if len(weight) == len(grad):
                grad = np.multiply(grad, weight)
                hess = np.multiply(hess, weight)
            else:
                num_data = len(weight)
                num_class = len(grad) // num_data
                if num_class * num_data != len(grad):
69
                    raise ValueError("length of grad and hess should equal to num_class * num_data")
wxchan's avatar
wxchan committed
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
                for k in range(num_class):
                    for i in range(num_data):
                        idx = k * num_data + i
                        grad[idx] *= weight[i]
                        hess[idx] *= weight[i]
        return grad, hess
    return inner

class LGBMModel(LGBMModelBase):
    """Implementation of the Scikit-Learn API for LightGBM.

    Parameters
    ----------
    num_leaves : int
        Maximum tree leaves for base learners.
    max_depth : int
        Maximum tree depth for base learners, -1 means no limit.
    learning_rate : float
        Boosting learning rate
    n_estimators : int
        Number of boosted trees to fit.
    max_bin : int
        Passed to the underlying training parameters as ``max_bin``.
    silent : boolean
        If True, training runs with ``verbose=0``; otherwise with ``verbose=1``.
    objective : string or callable
        Specify the learning task and the corresponding learning objective or
        a custom objective function to be used (see note below).
    nthread : int
        Number of parallel threads; values <= 0 are not forwarded to training.
    min_split_gain : float
        Minimum loss reduction required to make a further partition on a leaf node of the tree.
    min_child_weight : int
        Minimum sum of instance weight(hessian) needed in a child(leaf)
    min_child_samples : int
        Minimum number of data need in a child(leaf)
    subsample : float
        Subsample ratio of the training instance.
    subsample_freq : int
        Frequency of subsample, <=0 means not enabled
    colsample_bytree : float
        Subsample ratio of columns when constructing each tree.
    reg_alpha : float
        L1 regularization term on weights
    reg_lambda : float
        L2 regularization term on weights
    scale_pos_weight : float
        Balancing of positive and negative weights.
    is_unbalance : bool
        Is unbalance for binary classification
    seed : int
        Random number seed.

    Note
    ----
    A custom objective function can be provided for the ``objective``
    parameter. In this case, it should have the signature
    ``objective(y_true, y_pred) -> grad, hess``:

    y_true: array_like of shape [n_samples]
        The target values
    y_pred: array_like of shape [n_samples] or shape[n_samples* n_class]
        The predicted values

    grad: array_like of shape [n_samples] or shape[n_samples* n_class]
        The value of the gradient for each sample point.
    hess: array_like of shape [n_samples] or shape[n_samples* n_class]
        The value of the second derivative for each sample point

    for multi-class task, the y_pred is group by class_id first, then group by row_id
          if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
          and you should group grad and hess in this way as well
    """

    def __init__(self, num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 silent=True, objective="regression",
                 nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0):
        if not SKLEARN_INSTALLED:
            raise LightGBMError('sklearn needs to be installed in order to use this module')

        self.num_leaves = num_leaves
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_bin = max_bin
        self.silent = silent
        self.objective = objective
        self.nthread = nthread
        self.min_split_gain = min_split_gain
        self.min_child_weight = min_child_weight
        self.min_child_samples = min_child_samples
        self.subsample = subsample
        self.subsample_freq = subsample_freq
        self.colsample_bytree = colsample_bytree
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.scale_pos_weight = scale_pos_weight
        self.is_unbalance = is_unbalance
        self.seed = seed
        self._Booster = None
        self.best_iteration = -1
        # A callable objective is wrapped once, here, into the
        # ``(preds, dataset)`` form that ``train`` is called with.
        if callable(self.objective):
            self.fobj = _point_wise_objective(self.objective)
        else:
            self.fobj = None

    def booster(self):
        """Get the underlying lightgbm Booster of this model.

        This will raise an exception when fit was not called

        Returns
        -------
        booster : a lightgbm booster of underlying model
        """
        if self._Booster is None:
            raise LightGBMError('need to call fit beforehand')
        return self._Booster

    def get_params(self, deep=False):
        """Get parameters, dropping ``nthread`` when it is not a positive number."""
        params = super(LGBMModel, self).get_params(deep=deep)
        if self.nthread <= 0:
            params.pop('nthread', None)
        return params

    @staticmethod
    def _get_eval_item(collection, index):
        """Return the entry for validation set ``index`` or None when absent.

        ``collection`` may be None, a dict keyed by validation-set index, or a
        list/tuple aligned with ``eval_set``.
        """
        if collection is None:
            return None
        if isinstance(collection, dict):
            return collection.get(index, None)
        if index < len(collection):
            return collection[index]
        return None

    def fit(self, X, y,
            sample_weight=None, init_score=None, group=None,
            eval_set=None, eval_sample_weight=None,
            eval_init_score=None, eval_group=None,
            eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            feature_name=None, categorical_feature=None,
            other_params=None):
        """
        Fit the gradient boosting model

        Parameters
        ----------
        X : array_like
            Feature matrix
        y : array_like
            Labels
        sample_weight : array_like
            weight of training data
        init_score : array_like
            init score of training data
        group : array_like
            group data of training data
        eval_set : list, optional
            A list of (X, y) tuple pairs to use as a validation set for early-stopping.
            A single (X, y) tuple is also accepted.
        eval_sample_weight : List of array
            weight of eval data (a dict keyed by eval_set index is accepted too)
        eval_init_score : List of array
            init score of eval data (a dict keyed by eval_set index is accepted too)
        eval_group : List of array
            group data of eval data (a dict keyed by eval_set index is accepted too)
        eval_metric : str, list of str, callable, optional
            If a str, should be a built-in evaluation metric to use.
            If callable, a custom evaluation metric. The call
            signature is func(y_predicted, dataset) where dataset will be a
            Dataset object such that you may need to call the get_label
            method. And it must return (eval_name->str, eval_result->float, is_bigger_better->Bool)
        early_stopping_rounds : int
            Forwarded to ``train`` as ``early_stopping_rounds``.
        verbose : bool
            Forwarded to ``train`` as ``verbose_eval``.
        feature_name : list of str
            Feature names
        categorical_feature : list of str or int
            Categorical features, type int represents index,
            type str represents feature names (need to specify feature_name as well)
        other_params: dict
            Other parameters; they override the estimator parameters.

        Returns
        -------
        self
        """
        evals_result = {}
        params = self.get_params()
        params['verbose'] = 0 if self.silent else 1

        if other_params is not None:
            params.update(other_params)

        if self.fobj:
            # custom objective: the built-in objective is disabled
            params["objective"] = "None"
        else:
            params["objective"] = self.objective
            if eval_metric is None and eval_set is not None:
                # default metric for the built-in objectives
                eval_metric = {
                    'regression': 'l2',
                    'binary': 'binary_logloss',
                    'lambdarank': 'ndcg',
                    'multiclass': 'multi_logloss'
                }.get(self.objective, None)

        if callable(eval_metric):
            feval = eval_metric
        else:
            feval = None
            if is_str(eval_metric) or isinstance(eval_metric, list):
                params['metric'] = eval_metric

        def _construct_dataset(X, y, sample_weight, init_score, group):
            ret = Dataset(X, label=y, weight=sample_weight, group=group)
            ret.set_init_score(init_score)
            return ret

        train_set = _construct_dataset(X, y, sample_weight, init_score, group)

        valid_sets = []
        if eval_set is not None:
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            for i, valid_data in enumerate(eval_set):
                # reduce cost for prediction training data
                if valid_data[0] is X and valid_data[1] is y:
                    valid_set = train_set
                else:
                    valid_set = _construct_dataset(
                        valid_data[0], valid_data[1],
                        self._get_eval_item(eval_sample_weight, i),
                        self._get_eval_item(eval_init_score, i),
                        self._get_eval_item(eval_group, i))
                valid_sets.append(valid_set)

        self._Booster = train(params, train_set,
                              self.n_estimators, valid_sets=valid_sets,
                              early_stopping_rounds=early_stopping_rounds,
                              evals_result=evals_result, fobj=self.fobj, feval=feval,
                              verbose_eval=verbose, feature_name=feature_name,
                              categorical_feature=categorical_feature)

        if evals_result:
            self.evals_result_ = evals_result

        if early_stopping_rounds is not None:
            self.best_iteration = self._Booster.best_iteration
        return self

    def predict(self, data, raw_score=False, num_iteration=0):
        """Return the booster's prediction for ``data``.

        ``raw_score`` and ``num_iteration`` are forwarded unchanged to the
        underlying booster's ``predict``. Raises LightGBMError when fit was
        not called.
        """
        return self.booster().predict(data,
                                      raw_score=raw_score,
                                      num_iteration=num_iteration)

    def apply(self, X, num_iteration=0):
        """Return the predicted leaf every tree for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        num_iteration : int
            Forwarded to the booster's ``predict`` as ``num_iteration``; defaults to 0.

        Returns
        -------
        X_leaves : array_like, shape=[n_samples, n_trees]
        """
        return self.booster().predict(X,
                                      pred_leaf=True,
                                      num_iteration=num_iteration)

    def evals_result(self):
        """Return the evaluation results.

        Returns
        -------
        evals_result : dictionary

        Raises
        ------
        LightGBMError
            When no evaluation results were recorded by ``fit``.
        """
        # ``evals_result_`` only exists after a fit that produced results
        evals_result = getattr(self, 'evals_result_', None)
        if not evals_result:
            raise LightGBMError('No results.')
        return evals_result

    def feature_importance(self):
        """Feature importances

        Returns
        -------
        Array of normalized feature importances
        """
        importance_array = self.booster().feature_importance().astype(np.float32)
        return importance_array / importance_array.sum()

class LGBMRegressor(LGBMModel, LGBMRegressorBase):
    __doc__ = """Implementation of the scikit-learn API for LightGBM regression.
    """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])

    def fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=None, eval_sample_weight=None,
            eval_init_score=None,
            eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            feature_name=None, categorical_feature=None,
            other_params=None):
        """
        Fit the regression model.

        Arguments are the same as LGBMModel.fit, except that ``group`` and
        ``eval_group`` are not accepted; both are forwarded as None.

        Returns
        -------
        self
        """
        # Forward by keyword so the call does not depend on the exact
        # positional order of the parent signature.
        super(LGBMRegressor, self).fit(X, y,
                                       sample_weight=sample_weight,
                                       init_score=init_score,
                                       group=None,
                                       eval_set=eval_set,
                                       eval_sample_weight=eval_sample_weight,
                                       eval_init_score=eval_init_score,
                                       eval_group=None,
                                       eval_metric=eval_metric,
                                       early_stopping_rounds=early_stopping_rounds,
                                       verbose=verbose,
                                       feature_name=feature_name,
                                       categorical_feature=categorical_feature,
                                       other_params=other_params)
        return self

class LGBMClassifier(LGBMModel, LGBMClassifierBase):
    __doc__ = """Implementation of the scikit-learn API for LightGBM classification.

    """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])

    def __init__(self, num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 silent=True, objective="binary",
                 nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0):
        super(LGBMClassifier, self).__init__(num_leaves=num_leaves, max_depth=max_depth,
                                             learning_rate=learning_rate,
                                             n_estimators=n_estimators, max_bin=max_bin,
                                             silent=silent, objective=objective,
                                             nthread=nthread, min_split_gain=min_split_gain,
                                             min_child_weight=min_child_weight,
                                             min_child_samples=min_child_samples,
                                             subsample=subsample, subsample_freq=subsample_freq,
                                             colsample_bytree=colsample_bytree,
                                             reg_alpha=reg_alpha, reg_lambda=reg_lambda,
                                             scale_pos_weight=scale_pos_weight,
                                             is_unbalance=is_unbalance, seed=seed)

    def fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=None, eval_sample_weight=None,
            eval_init_score=None,
            eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            feature_name=None, categorical_feature=None,
            other_params=None):
        """
        Fit the classification model.

        Arguments are the same as LGBMModel.fit, except that ``group`` and
        ``eval_group`` are not accepted; both are forwarded as None.
        Labels of the training data and of every validation set are encoded
        with a label encoder fitted on ``y`` before training.

        Returns
        -------
        self
        """
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        # Work on a copy so the caller's dict is never modified.
        other_params = {} if other_params is None else dict(other_params)
        if self.n_classes_ > 2:
            # Switch to using a multiclass objective in the underlying LGBM instance
            self.objective = "multiclass"
            other_params['num_class'] = self.n_classes_

        self._le = LGBMLabelEncoder().fit(y)
        training_labels = self._le.transform(y)

        if eval_set is not None:
            if isinstance(eval_set, tuple):
                # a single (X, y) pair, as accepted by LGBMModel.fit
                eval_set = [eval_set]
            encoded_eval_set = []
            for valid_data in eval_set:
                if valid_data[0] is X and valid_data[1] is y:
                    # Hand over the very same encoded label object as the
                    # training labels so that the parent's identity check can
                    # reuse the training Dataset for this validation entry.
                    encoded_eval_set.append((X, training_labels))
                else:
                    encoded_eval_set.append((valid_data[0], self._le.transform(valid_data[1])))
            eval_set = encoded_eval_set

        super(LGBMClassifier, self).fit(X, training_labels,
                                        sample_weight=sample_weight,
                                        init_score=init_score,
                                        group=None,
                                        eval_set=eval_set,
                                        eval_sample_weight=eval_sample_weight,
                                        eval_init_score=eval_init_score,
                                        eval_group=None,
                                        eval_metric=eval_metric,
                                        early_stopping_rounds=early_stopping_rounds,
                                        verbose=verbose,
                                        feature_name=feature_name,
                                        categorical_feature=categorical_feature,
                                        other_params=other_params)
        return self

    def predict(self, data, raw_score=False, num_iteration=0):
        """Return the predicted class label for each row of ``data``.

        Two-dimensional booster output is reduced with argmax over axis 1;
        one-dimensional output is thresholded. The resulting class indices are
        mapped back to the original labels with the fitted label encoder.
        """
        class_probs = self.booster().predict(data,
                                             raw_score=raw_score,
                                             num_iteration=num_iteration)
        if len(class_probs.shape) > 1:
            column_indexes = np.argmax(class_probs, axis=1)
        else:
            # Probabilities are split at 0.5; raw margins are split at 0,
            # which is the margin corresponding to a probability of 0.5.
            threshold = 0.0 if raw_score else 0.5
            column_indexes = np.repeat(0, class_probs.shape[0])
            column_indexes[class_probs > threshold] = 1
        return self._le.inverse_transform(column_indexes)

    def predict_proba(self, data, raw_score=False, num_iteration=0):
        """Return the booster output arranged as one column per class.

        With more than two classes the booster output is returned as is.
        Otherwise the output is treated as the score of class one and a
        two-column array ``[1 - score, score]`` is returned.
        """
        class_probs = self.booster().predict(data,
                                             raw_score=raw_score,
                                             num_iteration=num_iteration)
        if self.n_classes_ > 2:
            return class_probs
        classone_probs = class_probs
        classzero_probs = 1.0 - classone_probs
        return np.vstack((classzero_probs, classone_probs)).transpose()


def _group_wise_objective(func):
    """Decorate an objective function
    Parameters
    ----------
    func: callable
        Expects a callable with signature ``func(y_true, group, y_pred)``:

        y_true: array_like of shape [n_samples]
            The target values
        group : array_like of shape
463
            Group size data of data
wxchan's avatar
wxchan committed
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
        y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
            The predicted values
    Returns
    -------
    new_func: callable
        The new objective function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

        preds: array_like, shape [n_samples] or shape[n_samples* n_class]
            The predicted values
        dataset: ``dataset``
            The training set from which the labels will be extracted using
            ``dataset.get_label()``
    """
    def inner(preds, dataset):
        """internal function"""
        labels = dataset.get_label()
        group = dataset.get_group()
        if group is None:
            raise ValueError("group should not be None for ranking task")
        grad, hess = func(labels, group, preds)
        """weighted for objective"""
        weight = dataset.get_weight()
        if weight is not None:
            """only one class"""
            if len(weight) == len(grad):
                grad = np.multiply(grad, weight)
                hess = np.multiply(hess, weight)
            else:
                raise ValueError("lenght of grad and hess should equal with num_data")
        return grad, hess
    return inner

class LGBMRanker(LGBMModel):
    __doc__ = """Implementation of the scikit-learn API for LightGBM ranking application.

    """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])

    def __init__(self, num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 silent=True, objective="lambdarank",
                 nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0):
        super(LGBMRanker, self).__init__(num_leaves=num_leaves, max_depth=max_depth,
                                         learning_rate=learning_rate,
                                         n_estimators=n_estimators, max_bin=max_bin,
                                         silent=silent, objective=objective,
                                         nthread=nthread, min_split_gain=min_split_gain,
                                         min_child_weight=min_child_weight,
                                         min_child_samples=min_child_samples,
                                         subsample=subsample, subsample_freq=subsample_freq,
                                         colsample_bytree=colsample_bytree,
                                         reg_alpha=reg_alpha, reg_lambda=reg_lambda,
                                         scale_pos_weight=scale_pos_weight,
                                         is_unbalance=is_unbalance, seed=seed)
        # Ranking objectives also receive the group data, so the point-wise
        # wrapper installed by the parent constructor is replaced here.
        if callable(self.objective):
            self.fobj = _group_wise_objective(self.objective)
        else:
            self.fobj = None

    def fit(self, X, y,
            sample_weight=None, init_score=None, group=None,
            eval_set=None, eval_sample_weight=None,
            eval_init_score=None, eval_group=None,
            eval_metric=None, eval_at=None,
            early_stopping_rounds=None, verbose=True,
            feature_name=None, categorical_feature=None,
            other_params=None):
        """
        Most arguments like LGBMModel.fit except following:

        eval_at : list of int
            The evaluation positions of NDCG

        Raises
        ------
        ValueError
            When ``group`` is None, or when ``eval_set`` is given and
            ``eval_group`` is missing, has a different length, or lacks the
            group data of some validation set.
        """
        # check group data
        if group is None:
            raise ValueError("should use group for ranking task")

        if eval_set is not None:
            if isinstance(eval_set, tuple):
                # a single (X, y) pair, as accepted by LGBMModel.fit
                eval_set = [eval_set]
            if eval_group is None:
                raise ValueError("eval_group cannot be None when eval_set is not None")
            if len(eval_group) != len(eval_set):
                raise ValueError("length of eval_group should equal with eval_set")
            if isinstance(eval_group, dict):
                # dict form: keyed by the index of the validation set
                groups = [eval_group.get(i, None) for i in range(len(eval_set))]
            else:
                groups = list(eval_group)
            if any(inner_group is None for inner_group in groups):
                raise ValueError("should set group for all eval data for ranking task")

        if eval_at is not None:
            # Work on a copy so the caller's dict is never modified.
            other_params = {} if other_params is None else dict(other_params)
            other_params['ndcg_eval_at'] = list(eval_at)

        super(LGBMRanker, self).fit(X, y,
                                    sample_weight=sample_weight,
                                    init_score=init_score,
                                    group=group,
                                    eval_set=eval_set,
                                    eval_sample_weight=eval_sample_weight,
                                    eval_init_score=eval_init_score,
                                    eval_group=eval_group,
                                    eval_metric=eval_metric,
                                    early_stopping_rounds=early_stopping_rounds,
                                    verbose=verbose,
                                    feature_name=feature_name,
                                    categorical_feature=categorical_feature,
                                    other_params=other_params)
        return self