sklearn.py 32 KB
Newer Older
wxchan's avatar
wxchan committed
1
# coding: utf-8
2
# pylint: disable = invalid-name, W0105, C0111, C0301
wxchan's avatar
wxchan committed
3
4
"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import
5

6
import inspect
wxchan's avatar
wxchan committed
7
8

import numpy as np
9
10

from .basic import IS_PY3, Dataset, LightGBMError
wxchan's avatar
wxchan committed
11
from .engine import train
12

13
'''sklearn'''
wxchan's avatar
wxchan committed
14
15
16
17
# Optional scikit-learn dependency: bind the sklearn base classes when the
# package is importable, otherwise fall back to inert placeholders so that this
# module can still be imported (LGBMModel.__init__ raises when
# SKLEARN_INSTALLED is False).
try:
    from sklearn.base import BaseEstimator
    from sklearn.base import RegressorMixin, ClassifierMixin
    from sklearn.preprocessing import LabelEncoder
    from sklearn.utils import deprecated
    SKLEARN_INSTALLED = True
    LGBMModelBase = BaseEstimator
    LGBMRegressorBase = RegressorMixin
    LGBMClassifierBase = ClassifierMixin
    LGBMLabelEncoder = LabelEncoder
except ImportError:
    SKLEARN_INSTALLED = False
    LGBMModelBase = object
    LGBMClassifierBase = object
    LGBMRegressorBase = object
    LGBMLabelEncoder = None

    def deprecated(extra=''):  # pylint: disable=unused-argument
        """Stand-in for ``sklearn.utils.deprecated`` when sklearn is missing.

        ``deprecated`` is applied as a decorator inside class bodies of this
        module, i.e. at import time. Without this fallback the name would be
        undefined and importing the module would fail with ``NameError``.

        Parameters
        ----------
        extra : str
            Deprecation message; accepted for signature compatibility, unused.

        Returns
        -------
        decorator : callable
            Identity decorator returning the decorated object unchanged.
        """
        def decorator(obj):
            """Return ``obj`` untouched."""
            return obj
        return decorator

wxchan's avatar
wxchan committed
31

wxchan's avatar
wxchan committed
32
33
34
35
36
37
38
def _argc(func):
    """Return how many arguments the callable ``func`` declares.

    On Python 3 (``IS_PY3`` true) this is the number of entries in
    ``inspect.signature(func).parameters``; otherwise it is the length of
    ``inspect.getargspec(func).args``.
    """
    if not IS_PY3:
        legacy_spec = inspect.getargspec(func)
        return len(legacy_spec.args)
    declared = inspect.signature(func).parameters
    return len(declared)


39
def _objective_function_wrapper(func):
    """Decorate an objective function.

    Note: for multi-class task, the y_pred is group by class_id first, then group by row_id
          if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
          and you should group grad and hess in this way as well

    Parameters
    ----------
    func: callable
        Expects a callable with signature ``func(y_true, y_pred)`` or ``func(y_true, y_pred, group)``:
            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples * n_class] (for multi-class)
                The predicted values
            group: array_like
                group/query data, used for ranking task

    Returns
    -------
    new_func: callable
        The new objective function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

        preds: array_like, shape [n_samples] or shape[n_samples * n_class]
            The predicted values
        dataset: ``dataset``
            The training set from which the labels will be extracted using
            ``dataset.get_label()``

    Raises
    ------
    TypeError
        (when ``new_func`` is called) if ``func`` does not declare 2 or 3 arguments.
    ValueError
        (when ``new_func`` is called) if the dataset has weights and the length
        of grad/hess is not a whole multiple of the number of weights.
    """
    def inner(preds, dataset):
        """Evaluate ``func`` on the dataset and weight the resulting grad/hess."""
        labels = dataset.get_label()
        argc = _argc(func)
        if argc == 2:
            grad, hess = func(labels, preds)
        elif argc == 3:
            grad, hess = func(labels, preds, dataset.get_group())
        else:
            raise TypeError("Self-defined objective function should have 2 or 3 arguments, got %d" % argc)

        # Apply per-sample weights (if any) to the objective's derivatives.
        weight = dataset.get_weight()
        if weight is None:
            return grad, hess

        num_data = len(weight)
        if num_data == len(grad):
            # Only one class: one grad/hess entry per sample.
            return np.multiply(grad, weight), np.multiply(hess, weight)

        # Multi-class: entries are laid out class-major (index k * num_data + i),
        # so viewing them as (num_class, num_data) lets the weight vector
        # broadcast across every class row. New arrays are produced, so the
        # objects returned by ``func`` are never modified in place.
        if num_data == 0 or len(grad) % num_data != 0:
            raise ValueError("Length of grad and hess should equal to num_class * num_data")
        num_class = len(grad) // num_data
        # np.reshape itself raises ValueError when hess has a different length.
        grad = np.multiply(np.reshape(grad, (num_class, num_data)), weight).ravel()
        hess = np.multiply(np.reshape(hess, (num_class, num_data)), weight).ravel()
        return grad, hess
    return inner

wxchan's avatar
wxchan committed
97

98
99
100
101
102
103
104
def _eval_function_wrapper(func):
    """Adapt a user-supplied eval function to the ``(preds, dataset)`` calling form.

    Note: for multi-class task, the y_pred is group by class_id first, then group by row_id
          if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]

    Parameters
    ----------
    func: callable
        One of the following call forms is expected:
            ``func(y_true, y_pred)``,
            ``func(y_true, y_pred, weight)``
         or ``func(y_true, y_pred, weight, group)``
            returning (eval_name->str, eval_result->float, is_bigger_better->Bool):

            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples * n_class] (for multi-class)
                The predicted values
            weight: array_like of shape [n_samples]
                The weight of samples
            group: array_like
                group/query data, used for ranking task

    Returns
    -------
    new_func: callable
        Eval function in the form expected by ``lightgbm.engine.train``,
        called as ``new_func(preds, dataset)``:

        preds: array_like, shape [n_samples] or shape[n_samples * n_class]
            The predicted values
        dataset: ``dataset``
            Data set whose labels are read with ``dataset.get_label()`` (and
            whose weight/group are read when ``func`` declares them)
    """
    def inner(preds, dataset):
        """Forward labels, preds and the optional weight/group to ``func``."""
        labels = dataset.get_label()
        n_args = _argc(func)
        if n_args not in (2, 3, 4):
            raise TypeError("Self-defined eval function should have 2, 3 or 4 arguments, got %d" % n_args)
        call_args = [labels, preds]
        if n_args > 2:
            # third positional argument: sample weights
            call_args.append(dataset.get_weight())
        if n_args > 3:
            # fourth positional argument: group/query data
            call_args.append(dataset.get_group())
        return func(*call_args)

    return inner

wxchan's avatar
wxchan committed
146

wxchan's avatar
wxchan committed
147
148
class LGBMModel(LGBMModelBase):
    """Base estimator exposing LightGBM training through a scikit-learn style API."""

    def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 subsample_for_bin=50000, objective="regression",
                 min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0, nthread=-1, silent=True,
                 sigmoid=1.0, huber_delta=1.0, gaussian_eta=1.0, fair_c=1.0,
                 max_position=20, label_gain=None,
                 drop_rate=0.1, skip_drop=0.5, max_drop=50,
                 uniform_drop=False, xgboost_dart_mode=False):
        """
        Implementation of the Scikit-Learn API for LightGBM.

        Parameters
        ----------
        boosting_type : string
            gbdt, traditional Gradient Boosting Decision Tree
            dart, Dropouts meet Multiple Additive Regression Trees
        num_leaves : int
            Maximum tree leaves for base learners.
        max_depth : int
            Maximum tree depth for base learners, -1 means no limit.
        learning_rate : float
            Boosting learning rate
        n_estimators : int
            Number of boosted trees to fit.
        max_bin : int
            Number of bucketed bin for feature values
        subsample_for_bin : int
            Number of samples for constructing bins.
        objective : string or callable
            Specify the learning task and the corresponding learning objective or
            a custom objective function to be used (see note below).
            default: binary for LGBMClassifier, lambdarank for LGBMRanker
        min_split_gain : float
            Minimum loss reduction required to make a further partition on a leaf node of the tree.
        min_child_weight : int
            Minimum sum of instance weight(hessian) needed in a child(leaf)
        min_child_samples : int
            Minimum number of data need in a child(leaf)
        subsample : float
            Subsample ratio of the training instance.
        subsample_freq : int
            Frequency of subsample, <=0 means no enable
        colsample_bytree : float
            Subsample ratio of columns when constructing each tree.
        reg_alpha : float
            L1 regularization term on weights
        reg_lambda : float
            L2 regularization term on weights
        scale_pos_weight : float
            Balancing of positive and negative weights.
        is_unbalance : bool
            Is unbalance for binary classification
        seed : int
            Random number seed.
        nthread : int
            Number of parallel threads
        silent : boolean
            Whether to print messages while running boosting.
        sigmoid : float
            Only used in binary classification and lambdarank. Parameter for sigmoid function.
        huber_delta : float
            Only used in regression. Parameter for Huber loss function.
        gaussian_eta : float
            Only used in regression. Parameter for L1 and Huber loss function.
            It is used to control the width of Gaussian function to approximate hessian.
        fair_c : float
            Only used in regression. Parameter for Fair loss function.
        max_position : int
            Only used in lambdarank, will optimize NDCG at this position.
        label_gain : list of float
            Only used in lambdarank, relevant gain for labels.
            For example, the gain of label 2 is 3 if using default label gains.
            None (default) means use default value of CLI version: {0,1,3,7,15,31,63,...}.
        drop_rate : float
            Only used when boosting_type='dart'. Probability to select dropping trees.
        skip_drop : float
            Only used when boosting_type='dart'. Probability to skip dropping trees.
        max_drop : int
            Only used when boosting_type='dart'. Max number of dropped trees in one iteration.
        uniform_drop : bool
            Only used when boosting_type='dart'. If true, drop trees uniformly, else drop according to weights.
        xgboost_dart_mode : bool
            Only used when boosting_type='dart'. Whether use xgboost dart mode.

        Raises
        ------
        LightGBMError
            If scikit-learn could not be imported.

        Note
        ----
        A custom objective function can be provided for the ``objective``
        parameter. In this case, it should have the signature
        ``objective(y_true, y_pred) -> grad, hess``
            or ``objective(y_true, y_pred, group) -> grad, hess``:

            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples * n_class]
                The predicted values
            group: array_like
                group/query data, used for ranking task
            grad: array_like of shape [n_samples] or shape[n_samples * n_class]
                The value of the gradient for each sample point.
            hess: array_like of shape [n_samples] or shape[n_samples * n_class]
                The value of the second derivative for each sample point

        for multi-class task, the y_pred is group by class_id first, then group by row_id
            if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
            and you should group grad and hess in this way as well
        """
        if not SKLEARN_INSTALLED:
            raise LightGBMError('Scikit-learn is required for this module')

        self.boosting_type = boosting_type
        self.num_leaves = num_leaves
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_bin = max_bin
        self.subsample_for_bin = subsample_for_bin
        self.objective = objective
        self.min_split_gain = min_split_gain
        self.min_child_weight = min_child_weight
        self.min_child_samples = min_child_samples
        self.subsample = subsample
        self.subsample_freq = subsample_freq
        self.colsample_bytree = colsample_bytree
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.scale_pos_weight = scale_pos_weight
        self.is_unbalance = is_unbalance
        self.seed = seed
        self.nthread = nthread
        self.silent = silent
        self.sigmoid = sigmoid
        self.huber_delta = huber_delta
        self.gaussian_eta = gaussian_eta
        self.fair_c = fair_c
        self.max_position = max_position
        self.label_gain = label_gain
        self.drop_rate = drop_rate
        self.skip_drop = skip_drop
        self.max_drop = max_drop
        self.uniform_drop = uniform_drop
        self.xgboost_dart_mode = xgboost_dart_mode
        # Fitted state; populated by fit().
        self._Booster = None
        self.evals_result = None
        self.best_iteration = -1
        # A callable objective is adapted to the (preds, dataset) form used by train().
        if callable(self.objective):
            self.fobj = _objective_function_wrapper(self.objective)
        else:
            self.fobj = None

    def fit(self, X, y,
            sample_weight=None, init_score=None, group=None,
            eval_set=None, eval_sample_weight=None,
            eval_init_score=None, eval_group=None,
            eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            feature_name=None, categorical_feature=None,
            callbacks=None):
        """
        Fit the gradient boosting model

        Parameters
        ----------
        X : array_like
            Feature matrix
        y : array_like
            Labels
        sample_weight : array_like
            weight of training data
        init_score : array_like
            init score of training data
        group : array_like
            group data of training data
        eval_set : list, optional
            A list of (X, y) tuple pairs to use as a validation set for early-stopping.
            A single (X, y) tuple is also accepted.
        eval_sample_weight : List of array
            weight of eval data (a dict keyed by eval set index is accepted too)
        eval_init_score : List of array
            init score of eval data (a dict keyed by eval set index is accepted too)
        eval_group : List of array
            group data of eval data (a dict keyed by eval set index is accepted too)
        eval_metric : str, list of str, callable, optional
            If a str, should be a built-in evaluation metric to use.
            If callable, a custom evaluation metric, see note for more details.
        early_stopping_rounds : int
            Forwarded to ``train`` as ``early_stopping_rounds``. When it is not None,
            ``best_iteration`` is taken from the trained booster.
        verbose : bool
            Forwarded to ``train`` as ``verbose_eval``. If `verbose` and an
            evaluation set is used, the evaluation progress is reported.
        feature_name : list of str
            Feature names
        categorical_feature : list of str or int
            Categorical features,
            type int represents index,
            type str represents feature names (need to specify feature_name as well)
        callbacks : list of callback functions
            List of callback functions that are applied at each iteration.
            See Callbacks in Python-API.md for more information.

        Returns
        -------
        self : object
            The fitted estimator.

        Raises
        ------
        TypeError
            If eval_sample_weight, eval_init_score or eval_group is neither
            None, a list nor a dict.

        Note
        ----
        Custom eval function expects a callable with following functions:
            ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)``
                or ``func(y_true, y_pred, weight, group)``.
            return (eval_name, eval_result, is_bigger_better)
                or list of (eval_name, eval_result, is_bigger_better)

            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples * n_class] (for multi-class)
                The predicted values
            weight: array_like of shape [n_samples]
                The weight of samples
            group: array_like
                group/query data, used for ranking task
            eval_name: str
                name of evaluation
            eval_result: float
                eval result
            is_bigger_better: bool
                is eval result bigger better, e.g. AUC is bigger_better.
        for multi-class task, the y_pred is group by class_id first, then group by row_id
          if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
        """
        evals_result = {}
        params = self.get_params()
        params['verbose'] = -1 if self.silent else 1
        # n_classes_ / eval_at are not set by this class; they are picked up
        # only when present on the instance.
        if hasattr(self, 'n_classes_') and self.n_classes_ > 2:
            params['num_class'] = self.n_classes_
        if hasattr(self, 'eval_at'):
            params['ndcg_eval_at'] = self.eval_at
        if self.fobj:
            params['objective'] = 'None'  # objective = nullptr for unknown objective
        if 'label_gain' in params and params['label_gain'] is None:
            del params['label_gain']  # use default of cli version

        if callable(eval_metric):
            feval = _eval_function_wrapper(eval_metric)
        else:
            feval = None
            # NOTE(review): a None eval_metric is forwarded as params['metric'] = None;
            # confirm that the parameter handling in basic/engine accepts None.
            params['metric'] = eval_metric

        def _construct_dataset(X, y, sample_weight, init_score, group, params):
            """Build a Dataset and attach its init score."""
            ret = Dataset(X, label=y, max_bin=self.max_bin, weight=sample_weight, group=group, params=params)
            ret.set_init_score(init_score)
            return ret

        def _get_meta_data(collection, i):
            """Return the i-th entry of a per-eval-set list/dict, or None if absent."""
            if collection is None:
                return None
            elif isinstance(collection, list):
                return collection[i] if len(collection) > i else None
            elif isinstance(collection, dict):
                return collection.get(i, None)
            else:
                raise TypeError('eval_sample_weight, eval_init_score, and eval_group should be dict or list')

        train_set = _construct_dataset(X, y, sample_weight, init_score, group, params)

        valid_sets = []
        if eval_set is not None:
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            for i, valid_data in enumerate(eval_set):
                # Reuse the training Dataset when the very same objects are passed
                # as an eval set, to reduce the cost of predicting training data.
                if valid_data[0] is X and valid_data[1] is y:
                    valid_set = train_set
                else:
                    valid_weight = _get_meta_data(eval_sample_weight, i)
                    valid_init_score = _get_meta_data(eval_init_score, i)
                    valid_group = _get_meta_data(eval_group, i)
                    valid_set = _construct_dataset(valid_data[0], valid_data[1], valid_weight, valid_init_score, valid_group, params)
                valid_sets.append(valid_set)

        self._Booster = train(params, train_set,
                              self.n_estimators, valid_sets=valid_sets,
                              early_stopping_rounds=early_stopping_rounds,
                              evals_result=evals_result, fobj=self.fobj, feval=feval,
                              verbose_eval=verbose, feature_name=feature_name,
                              categorical_feature=categorical_feature,
                              callbacks=callbacks)

        # Always overwrite the fitted state so that values from an earlier
        # fit() call cannot be mistaken for results of this one.
        self.evals_result = evals_result if evals_result else None
        if early_stopping_rounds is not None:
            self.best_iteration = self._Booster.best_iteration
        else:
            self.best_iteration = -1
        return self

    def predict(self, X, raw_score=False, num_iteration=0):
        """
        Return the predicted value for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        raw_score : bool
            Forwarded to the booster's ``predict`` as ``raw_score``.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes]

        Raises
        ------
        LightGBMError
            If the model has not been fitted yet.
        """
        return self.booster_.predict(X, raw_score=raw_score, num_iteration=num_iteration)

    def apply(self, X, num_iteration=0):
        """
        Return the predicted leaf of every tree for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        X_leaves : array_like, shape=[n_samples, n_trees]

        Raises
        ------
        LightGBMError
            If the model has not been fitted yet.
        """
        return self.booster_.predict(X, pred_leaf=True, num_iteration=num_iteration)

    @property
    def booster_(self):
        """Get the underlying lightgbm Booster of this model."""
        if self._Booster is None:
            raise LightGBMError('No booster found. Need to call fit beforehand.')
        return self._Booster

    @property
    def evals_result_(self):
        """Get the evaluation results."""
        if self.evals_result is None:
            raise LightGBMError('No results found. Need to call fit with eval set beforehand.')
        return self.evals_result

    @property
    def feature_importance_(self):
        """Get normalized feature importances (importances divided by their sum)."""
        importance_array = self.booster_.feature_importance().astype(np.float32)
        return importance_array / importance_array.sum()

    @deprecated('Use attribute booster_ instead.')
    def booster(self):
        """Deprecated accessor; returns the same object as ``booster_``."""
        return self.booster_

    @deprecated('Use attribute feature_importance_ instead.')
    def feature_importance(self):
        """Deprecated accessor; returns the same value as ``feature_importance_``."""
        return self.feature_importance_
wxchan's avatar
wxchan committed
500

wxchan's avatar
wxchan committed
501

wxchan's avatar
wxchan committed
502
503
class LGBMRegressor(LGBMModel, LGBMRegressorBase):
    """LightGBM regressor: ``LGBMModel`` restricted to regression-related parameters."""

    def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 subsample_for_bin=50000, objective="regression",
                 min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0,
                 seed=0, nthread=-1, silent=True,
                 huber_delta=1.0, gaussian_eta=1.0, fair_c=1.0,
                 drop_rate=0.1, skip_drop=0.5, max_drop=50,
                 uniform_drop=False, xgboost_dart_mode=False):
        """Forward every constructor argument, by keyword, to ``LGBMModel.__init__``.

        Parameters of ``LGBMModel`` that are not listed in this signature keep
        the defaults declared by ``LGBMModel``.
        """
        forwarded = dict(
            boosting_type=boosting_type,
            num_leaves=num_leaves,
            max_depth=max_depth,
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            max_bin=max_bin,
            subsample_for_bin=subsample_for_bin,
            objective=objective,
            min_split_gain=min_split_gain,
            min_child_weight=min_child_weight,
            min_child_samples=min_child_samples,
            subsample=subsample,
            subsample_freq=subsample_freq,
            colsample_bytree=colsample_bytree,
            reg_alpha=reg_alpha,
            reg_lambda=reg_lambda,
            seed=seed,
            nthread=nthread,
            silent=silent,
            huber_delta=huber_delta,
            gaussian_eta=gaussian_eta,
            fair_c=fair_c,
            drop_rate=drop_rate,
            skip_drop=skip_drop,
            max_drop=max_drop,
            uniform_drop=uniform_drop,
            xgboost_dart_mode=xgboost_dart_mode,
        )
        super(LGBMRegressor, self).__init__(**forwarded)

    def fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=None, eval_sample_weight=None,
            eval_init_score=None,
            eval_metric="l2",
            early_stopping_rounds=None, verbose=True,
            feature_name=None, categorical_feature=None, callbacks=None):
        """Fit the regressor by delegating to ``LGBMModel.fit``.

        Takes the same arguments as ``LGBMModel.fit`` except ``group`` and
        ``eval_group``, which are not exposed here; ``eval_metric`` defaults
        to "l2".

        Returns
        -------
        self : object
            The fitted estimator.
        """
        fit_options = dict(
            sample_weight=sample_weight,
            init_score=init_score,
            eval_set=eval_set,
            eval_sample_weight=eval_sample_weight,
            eval_init_score=eval_init_score,
            eval_metric=eval_metric,
            early_stopping_rounds=early_stopping_rounds,
            verbose=verbose,
            feature_name=feature_name,
            categorical_feature=categorical_feature,
            callbacks=callbacks,
        )
        super(LGBMRegressor, self).fit(X, y, **fit_options)
        return self

wxchan's avatar
wxchan committed
546

wxchan's avatar
wxchan committed
547
548
class LGBMClassifier(LGBMModel, LGBMClassifierBase):
    """
    LightGBM classifier with a scikit-learn style interface.

    Target labels are encoded with a label encoder inside ``fit``;
    ``predict`` maps the predicted class indices back to the original labels.
    """

    def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 subsample_for_bin=50000, objective="binary",
                 min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0, nthread=-1,
                 silent=True, sigmoid=1.0,
                 drop_rate=0.1, skip_drop=0.5, max_drop=50,
                 uniform_drop=False, xgboost_dart_mode=False):
        """Store the hyper-parameters by forwarding them to the parent constructor."""
        # Populated by fit(); None marks the estimator as not fitted yet and
        # makes the classes_ / n_classes_ properties raise.
        self.classes = None
        self.n_classes = None
        super(LGBMClassifier, self).__init__(boosting_type=boosting_type, num_leaves=num_leaves,
                                             max_depth=max_depth, learning_rate=learning_rate,
                                             n_estimators=n_estimators, max_bin=max_bin,
                                             subsample_for_bin=subsample_for_bin, objective=objective,
                                             min_split_gain=min_split_gain, min_child_weight=min_child_weight,
                                             min_child_samples=min_child_samples, subsample=subsample,
                                             subsample_freq=subsample_freq, colsample_bytree=colsample_bytree,
                                             reg_alpha=reg_alpha, reg_lambda=reg_lambda,
                                             scale_pos_weight=scale_pos_weight, is_unbalance=is_unbalance,
                                             seed=seed, nthread=nthread, silent=silent, sigmoid=sigmoid,
                                             drop_rate=drop_rate, skip_drop=skip_drop, max_drop=max_drop,
                                             uniform_drop=uniform_drop, xgboost_dart_mode=xgboost_dart_mode)

    def fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=None, eval_sample_weight=None,
            eval_init_score=None,
            eval_metric="binary_logloss",
            early_stopping_rounds=None, verbose=True,
            feature_name=None, categorical_feature=None,
            callbacks=None):
        """
        Encode the labels, pick the objective/metric and fit via the parent ``fit``.

        The labels in ``y`` (and in every ``(X, y)`` pair of ``eval_set``) are
        transformed with a label encoder fitted on ``y``. When more than two
        classes are found, a string objective is switched to ``"multiclass"``
        and the default ``"binary_logloss"`` metric is switched to
        ``"multi_logloss"``. All other arguments are forwarded unchanged.

        Returns
        -------
        self : object
            This estimator, to allow call chaining.
        """
        self._le = LGBMLabelEncoder().fit(y)
        y = self._le.transform(y)

        self.classes = self._le.classes_
        self.n_classes = len(self.classes)
        if self.n_classes > 2:
            # Switch to using a multiclass objective in the underlying LGBM instance.
            # A user-supplied callable objective is left untouched: replacing it
            # with a string would silently discard the custom objective and make
            # get_params() report a value the user never passed.
            if not callable(self.objective):
                self.objective = "multiclass"
            if eval_set is not None and eval_metric == "binary_logloss":
                eval_metric = "multi_logloss"

        if eval_set is not None:
            # Validation labels must live in the same encoded space as y.
            eval_set = [(x[0], self._le.transform(x[1])) for x in eval_set]

        super(LGBMClassifier, self).fit(X, y, sample_weight=sample_weight,
                                        init_score=init_score, eval_set=eval_set,
                                        eval_sample_weight=eval_sample_weight,
                                        eval_init_score=eval_init_score,
                                        eval_metric=eval_metric,
                                        early_stopping_rounds=early_stopping_rounds,
                                        verbose=verbose, feature_name=feature_name,
                                        categorical_feature=categorical_feature,
                                        callbacks=callbacks)
        return self

    def predict(self, X, raw_score=False, num_iteration=0):
        """
        Return the predicted class label for each sample.

        The label is the class with the highest value in the output of
        ``predict_proba`` (called with the same arguments), mapped back to the
        original label values seen in ``fit``.
        """
        class_probs = self.predict_proba(X, raw_score, num_iteration)
        class_index = np.argmax(class_probs, axis=1)
        return self._le.inverse_transform(class_index)

    def predict_proba(self, X, raw_score=False, num_iteration=0):
        """
        Return the predicted probability for each class for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        raw_score : bool
            Forwarded as-is to the underlying booster's ``predict``.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        predicted_probability : array_like, shape=[n_samples, n_classes]
            With more than two classes the booster output is returned directly.
            Otherwise the booster output ``p`` is taken as the second column
            and ``1 - p`` as the first.
        """
        class_probs = self.booster_.predict(X, raw_score=raw_score, num_iteration=num_iteration)
        if self.n_classes > 2:
            return class_probs
        # Two-class case: build the two columns from the single booster output.
        return np.vstack((1. - class_probs, class_probs)).transpose()

    @property
    def classes_(self):
        """Get class label array."""
        if self.classes is None:
            raise LightGBMError('No classes found. Need to call fit beforehand.')
        return self.classes

    @property
    def n_classes_(self):
        """Get number of classes"""
        if self.n_classes is None:
            raise LightGBMError('No classes found. Need to call fit beforehand.')
        return self.n_classes
wxchan's avatar
wxchan committed
646

wxchan's avatar
wxchan committed
647

wxchan's avatar
wxchan committed
648
649
class LGBMRanker(LGBMModel):
    """LightGBM ranker with a scikit-learn style interface (default objective ``"lambdarank"``)."""

    def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 subsample_for_bin=50000, objective="lambdarank",
                 min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0, nthread=-1, silent=True,
                 sigmoid=1.0, max_position=20, label_gain=None,
                 drop_rate=0.1, skip_drop=0.5, max_drop=50,
                 uniform_drop=False, xgboost_dart_mode=False):
        """Store the hyper-parameters by forwarding them to the parent constructor."""
        super(LGBMRanker, self).__init__(
            boosting_type=boosting_type, num_leaves=num_leaves,
            max_depth=max_depth, learning_rate=learning_rate,
            n_estimators=n_estimators, max_bin=max_bin,
            subsample_for_bin=subsample_for_bin, objective=objective,
            min_split_gain=min_split_gain, min_child_weight=min_child_weight,
            min_child_samples=min_child_samples, subsample=subsample,
            subsample_freq=subsample_freq, colsample_bytree=colsample_bytree,
            reg_alpha=reg_alpha, reg_lambda=reg_lambda,
            scale_pos_weight=scale_pos_weight, is_unbalance=is_unbalance,
            seed=seed, nthread=nthread, silent=silent,
            sigmoid=sigmoid, max_position=max_position, label_gain=label_gain,
            drop_rate=drop_rate, skip_drop=skip_drop, max_drop=max_drop,
            uniform_drop=uniform_drop, xgboost_dart_mode=xgboost_dart_mode)

    def fit(self, X, y,
            sample_weight=None, init_score=None, group=None,
            eval_set=None, eval_sample_weight=None,
            eval_init_score=None, eval_group=None,
            eval_metric='ndcg', eval_at=1,
            early_stopping_rounds=None, verbose=True,
            feature_name=None, categorical_feature=None,
            callbacks=None):
        """
        Validate the group information, then fit via the parent ``fit``.

        Arguments not listed here are forwarded unchanged to the parent ``fit``.

        Parameters
        ----------
        group : required
            Group data for the training set; must not be None.

        eval_group : list or dict
            Group data for each entry of ``eval_set``. Required when
            ``eval_set`` is given, must have the same length as ``eval_set``
            and contain no None entry. A dict must be keyed by consecutive
            indices starting from 0.

        eval_at : list of int
            The evaluation positions of NDCG. Stored on the estimator as
            ``eval_at`` when it is not None.

        Returns
        -------
        self : object
            This estimator, to allow call chaining.

        Raises
        ------
        ValueError
            If ``group`` is missing, or ``eval_group`` is missing, of the
            wrong length or incomplete while ``eval_set`` is given.
        """
        # check group data
        if group is None:
            raise ValueError("Should set group for ranking task")

        if eval_set is not None:
            if eval_group is None:
                raise ValueError("Eval_group cannot be None when eval_set is not None")
            if len(eval_group) != len(eval_set):
                raise ValueError("Length of eval_group should equal to eval_set")

            # Only dict and list containers are inspected for missing entries.
            if isinstance(eval_group, dict):
                has_missing = any(idx not in eval_group or eval_group[idx] is None
                                  for idx in range(len(eval_group)))
            elif isinstance(eval_group, list):
                has_missing = any(entry is None for entry in eval_group)
            else:
                has_missing = False
            if has_missing:
                raise ValueError("Should set group for all eval dataset for ranking task; if you use dict, the index should start from 0")

        if eval_at is not None:
            self.eval_at = eval_at

        fit_options = dict(
            sample_weight=sample_weight,
            init_score=init_score,
            group=group,
            eval_set=eval_set,
            eval_sample_weight=eval_sample_weight,
            eval_init_score=eval_init_score,
            eval_group=eval_group,
            eval_metric=eval_metric,
            early_stopping_rounds=early_stopping_rounds,
            verbose=verbose,
            feature_name=feature_name,
            categorical_feature=categorical_feature,
            callbacks=callbacks,
        )
        super(LGBMRanker, self).fit(X, y, **fit_options)
        return self