sklearn.py 25.2 KB
Newer Older
wxchan's avatar
wxchan committed
1
# coding: utf-8
2
# pylint: disable = invalid-name, W0105, C0111, C0301
wxchan's avatar
wxchan committed
3
4
"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import
5
import inspect
wxchan's avatar
wxchan committed
6
7

import numpy as np
Guolin Ke's avatar
Guolin Ke committed
8
from .basic import LightGBMError, Dataset, is_str
wxchan's avatar
wxchan committed
9
from .engine import train
10
'''sklearn'''
wxchan's avatar
wxchan committed
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# Optional scikit-learn dependency.
# When the import succeeds, the LGBM*Base aliases point at the real sklearn
# base/mixin classes and SKLEARN_INSTALLED is True. When it fails, the aliases
# fall back to plain ``object`` (and the label encoder to ``None``) so that the
# class statements further down this module can still be evaluated.
try:
    from sklearn.base import BaseEstimator
    from sklearn.base import RegressorMixin, ClassifierMixin
    from sklearn.preprocessing import LabelEncoder
    SKLEARN_INSTALLED = True
    LGBMModelBase = BaseEstimator
    LGBMRegressorBase = RegressorMixin
    LGBMClassifierBase = ClassifierMixin
    LGBMLabelEncoder = LabelEncoder
except ImportError:
    # scikit-learn is not importable: keep the module loadable with stand-ins.
    SKLEARN_INSTALLED = False
    LGBMModelBase = object
    LGBMClassifierBase = object
    LGBMRegressorBase = object
    LGBMLabelEncoder = None

27
def _objective_function_wrapper(func):
wxchan's avatar
wxchan committed
28
29
30
31
32
33
34
    """Decorate an objective function
    Note: for multi-class task, the y_pred is group by class_id first, then group by row_id
          if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
          and you should group grad and hess in this way as well
    Parameters
    ----------
    func: callable
35
36
37
38
39
40
41
        Expects a callable with signature ``func(y_true, y_pred)`` or ``func(y_true, y_pred, group):
            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
                The predicted values
            group: array_like
                group/query data, used for ranking task
wxchan's avatar
wxchan committed
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57

    Returns
    -------
    new_func: callable
        The new objective function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

        preds: array_like, shape [n_samples] or shape[n_samples* n_class]
            The predicted values
        dataset: ``dataset``
            The training set from which the labels will be extracted using
            ``dataset.get_label()``
    """
    def inner(preds, dataset):
        """internal function"""
        labels = dataset.get_label()
58
59
60
61
62
63
        argc = len(inspect.getargspec(func).args)
        if argc == 2:
            grad, hess = func(labels, preds)
        elif argc == 3:
            grad, hess = func(labels, preds, dataset.get_group())
        else:
64
            raise TypeError("Self-defined objective function should have 2 or 3 arguments, got %d" %(argc))
wxchan's avatar
wxchan committed
65
66
67
68
69
70
71
72
73
74
75
        """weighted for objective"""
        weight = dataset.get_weight()
        if weight is not None:
            """only one class"""
            if len(weight) == len(grad):
                grad = np.multiply(grad, weight)
                hess = np.multiply(hess, weight)
            else:
                num_data = len(weight)
                num_class = len(grad) // num_data
                if num_class * num_data != len(grad):
76
                    raise ValueError("Length of grad and hess should equal to num_class * num_data")
wxchan's avatar
wxchan committed
77
78
79
80
81
82
83
84
                for k in range(num_class):
                    for i in range(num_data):
                        idx = k * num_data + i
                        grad[idx] *= weight[i]
                        hess[idx] *= weight[i]
        return grad, hess
    return inner

85
86
87
88
89
90
91
def _eval_function_wrapper(func):
    """Decorate an eval function
    Note: for multi-class task, the y_pred is group by class_id first, then group by row_id
          if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
    Parameters
    ----------
    func: callable
92
93
94
95
96
        Expects a callable with following functions:
            ``func(y_true, y_pred)``,
            ``func(y_true, y_pred, weight)``
         or ``func(y_true, y_pred, weight, group)``
            and return (eval_name->str, eval_result->float, is_bigger_better->Bool):
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129

            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
                The predicted values
            weight: array_like of shape [n_samples]
                The weight of samples
            group: array_like
                group/query data, used for ranking task

    Returns
    -------
    new_func: callable
        The new eval function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

        preds: array_like, shape [n_samples] or shape[n_samples* n_class]
            The predicted values
        dataset: ``dataset``
            The training set from which the labels will be extracted using
            ``dataset.get_label()``
    """
    def inner(preds, dataset):
        """internal function"""
        labels = dataset.get_label()
        argc = len(inspect.getargspec(func).args)
        if argc == 2:
            return func(labels, preds)
        elif argc == 3:
            return func(labels, preds, dataset.get_weight())
        elif argc == 4:
            return func(labels, preds, dataset.get_weight(), dataset.get_group())
        else:
130
            raise TypeError("Self-defined eval function should have 2, 3 or 4 arguments, got %d" %(argc))
131
132
    return inner

wxchan's avatar
wxchan committed
133
134
class LGBMModel(LGBMModelBase):
    """Base estimator holding the hyper-parameters and the trained booster."""

    def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 silent=True, objective="regression",
                 nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0):
        """
        Implementation of the Scikit-Learn API for LightGBM.

        Parameters
        ----------
        boosting_type : string
            gbdt, traditional Gradient Boosting Decision Tree
            dart, Dropouts meet Multiple Additive Regression Trees
        num_leaves : int
            Maximum tree leaves for base learners.
        max_depth : int
            Maximum tree depth for base learners, -1 means no limit.
        learning_rate : float
            Boosting learning rate
        n_estimators : int
            Number of boosted trees to fit.
        max_bin : int
            Number of bucketed bins for feature values
        silent : boolean
            Whether to suppress messages while running boosting
            (``fit`` sets ``verbose`` to 0 when silent, 1 otherwise).
        objective : string or callable
            Specify the learning task and the corresponding learning objective or
            a custom objective function to be used (see note below).
            default: binary for LGBMClassifier, lambdarank for LGBMRanker
        nthread : int
            Number of parallel threads
        min_split_gain : float
            Minimum loss reduction required to make a further partition on a leaf node of the tree.
        min_child_weight : int
            Minimum sum of instance weight(hessian) needed in a child(leaf)
        min_child_samples : int
            Minimum number of data needed in a child(leaf)
        subsample : float
            Subsample ratio of the training instance.
        subsample_freq : int
            Frequency of subsample, <=0 means not enabled
        colsample_bytree : float
            Subsample ratio of columns when constructing each tree.
        reg_alpha : float
            L1 regularization term on weights
        reg_lambda : float
            L2 regularization term on weights
        scale_pos_weight : float
            Balancing of positive and negative weights.
        is_unbalance : bool
            Is unbalance for binary classification
        seed : int
            Random number seed.

        Note
        ----
        A custom objective function can be provided for the ``objective``
        parameter. In this case, it should have the signature
        ``objective(y_true, y_pred) -> grad, hess``
            or ``objective(y_true, y_pred, group) -> grad, hess``:

            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples* n_class]
                The predicted values
            group: array_like
                group/query data, used for ranking task
            grad: array_like of shape [n_samples] or shape[n_samples* n_class]
                The value of the gradient for each sample point.
            hess: array_like of shape [n_samples] or shape[n_samples* n_class]
                The value of the second derivative for each sample point

        for multi-class task, the y_pred is grouped by class_id first, then by row_id
            if you want to get the i-th row y_pred in the j-th class, access y_pred[j*num_data+i]
            and you should group grad and hess in this way as well
        """
        if not SKLEARN_INSTALLED:
            raise LightGBMError('Scikit-learn is required for this module')

        self.boosting_type = boosting_type
        self.num_leaves = num_leaves
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_bin = max_bin
        self.silent = silent
        self.objective = objective
        self.nthread = nthread
        self.min_split_gain = min_split_gain
        self.min_child_weight = min_child_weight
        self.min_child_samples = min_child_samples
        self.subsample = subsample
        self.subsample_freq = subsample_freq
        self.colsample_bytree = colsample_bytree
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.scale_pos_weight = scale_pos_weight
        self.is_unbalance = is_unbalance
        self.seed = seed
        self._Booster = None
        self.best_iteration = -1
        # A callable objective is wrapped once here so ``fit`` can hand it to
        # ``train`` as ``fobj``; a string objective goes through ``params``.
        if callable(self.objective):
            self.fobj = _objective_function_wrapper(self.objective)
        else:
            self.fobj = None

    def booster(self):
        """
        Get the underlying lightgbm Booster of this model.

        This will raise an exception when fit was not called

        Returns
        -------
        booster : a lightgbm booster of underlying model
        """
        if self._Booster is None:
            raise LightGBMError('Need to call fit beforehand')
        return self._Booster

    def get_params(self, deep=False):
        """
        Get parameters.

        ``nthread`` is omitted from the result when it is not positive.
        """
        params = super(LGBMModel, self).get_params(deep=deep)
        if self.nthread <= 0:
            params.pop('nthread', None)
        return params

    def fit(self, X, y,
            sample_weight=None, init_score=None, group=None,
            eval_set=None, eval_sample_weight=None,
            eval_init_score=None, eval_group=None,
            eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            feature_name=None, categorical_feature=None,
            other_params=None):
        """
        Fit the gradient boosting model

        Parameters
        ----------
        X : array_like
            Feature matrix
        y : array_like
            Labels
        sample_weight : array_like
            weight of training data
        init_score : array_like
            init score of training data
        group : array_like
            group data of training data
        eval_set : list, optional
            A list of (X, y) tuple pairs to use as a validation set for early-stopping
        eval_sample_weight : list of array, or dict mapping eval_set index to array
            weight of eval data
        eval_init_score : list of array, or dict mapping eval_set index to array
            init score of eval data
        eval_group : list of array, or dict mapping eval_set index to array
            group data of eval data
        eval_metric : str, list of str, callable, optional
            If a str, should be a built-in evaluation metric to use.
            If callable, a custom evaluation metric, see note for more details.
        early_stopping_rounds : int
            Forwarded to ``train``; when not None, ``best_iteration`` is read
            back from the booster after training.
        verbose : bool
            If `verbose` and an evaluation set is used, writes the evaluation
            results (forwarded to ``train`` as ``verbose_eval``).
        feature_name : list of str
            Feature names
        categorical_feature : list of str or int
            Categorical features,
            type int represents index,
            type str represents feature names (need to specify feature_name as well)
        other_params: dict
            Other parameters

        Note
        ----
        Custom eval function expects a callable with following functions:
            ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)``
                or ``func(y_true, y_pred, weight, group)``.
            return (eval_name, eval_result, is_bigger_better)
                or list of (eval_name, eval_result, is_bigger_better)

            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples * n_class] (for multi-class)
                The predicted values
            weight: array_like of shape [n_samples]
                The weight of samples
            group: array_like
                group/query data, used for ranking task
            eval_name: str
                name of evaluation
            eval_result: float
                eval result
            is_bigger_better: bool
                is eval result bigger better, e.g. AUC is bigger_better.
        for multi-class task, the y_pred is grouped by class_id first, then by row_id
          if you want to get the i-th row y_pred in the j-th class, access y_pred[j*num_data+i]
        """
        evals_result = {}
        params = self.get_params()
        params['verbose'] = 0 if self.silent else 1

        if other_params is not None:
            params.update(other_params)

        # With a custom objective the built-in one is disabled via the string
        # "None"; the wrapped callable is passed to ``train`` as ``fobj``.
        if self.fobj:
            params["objective"] = "None"
        else:
            params["objective"] = self.objective

        if callable(eval_metric):
            feval = _eval_function_wrapper(eval_metric)
        else:
            feval = None
            if is_str(eval_metric) or isinstance(eval_metric, list):
                params.update({'metric': eval_metric})

        def _construct_dataset(X, y, sample_weight, init_score, group, params):
            """Build a Dataset and attach its init score."""
            ret = Dataset(X, label=y, max_bin=self.max_bin, weight=sample_weight, group=group, params=params)
            ret.set_init_score(init_score)
            return ret

        def _get_meta_data(collection, i):
            """Return the i-th per-eval-set entry from a list or dict, or None if absent."""
            if collection is None:
                return None
            if isinstance(collection, dict):
                return collection.get(i, None)
            if isinstance(collection, (list, tuple)):
                return collection[i] if i < len(collection) else None
            raise TypeError('eval_sample_weight, eval_init_score, and eval_group should be dict or list')

        train_set = _construct_dataset(X, y, sample_weight, init_score, group, params)

        valid_sets = []
        if eval_set is not None:
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            for i, valid_data in enumerate(eval_set):
                # reduce cost for prediction training data
                if valid_data[0] is X and valid_data[1] is y:
                    valid_set = train_set
                else:
                    valid_weight = _get_meta_data(eval_sample_weight, i)
                    valid_init_score = _get_meta_data(eval_init_score, i)
                    valid_group = _get_meta_data(eval_group, i)
                    valid_set = _construct_dataset(valid_data[0], valid_data[1], valid_weight, valid_init_score, valid_group, params)
                valid_sets.append(valid_set)

        self._Booster = train(params, train_set,
                              self.n_estimators, valid_sets=valid_sets,
                              early_stopping_rounds=early_stopping_rounds,
                              evals_result=evals_result, fobj=self.fobj, feval=feval,
                              verbose_eval=verbose, feature_name=feature_name,
                              categorical_feature=categorical_feature)

        if evals_result:
            self.evals_result_ = evals_result

        if early_stopping_rounds is not None:
            self.best_iteration = self._Booster.best_iteration
        return self

    def predict(self, data, raw_score=False, num_iteration=0):
        """
        Return the predicted value for each sample.

        Parameters
        ----------
        data : array_like, shape=[n_samples, n_features]
            Input features matrix.

        raw_score : bool
            Forwarded to the booster's ``predict`` as ``raw_score``.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes]
        """
        return self.booster().predict(data,
                                      raw_score=raw_score,
                                      num_iteration=num_iteration)

    def apply(self, X, num_iteration=0):
        """
        Return the predicted leaf of every tree for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        X_leaves : array_like, shape=[n_samples, n_trees]
        """
        return self.booster().predict(X,
                                      pred_leaf=True,
                                      num_iteration=num_iteration)

    def evals_result(self):
        """
        Return the evaluation results.

        Raises LightGBMError when no evaluation results were recorded by fit.

        Returns
        -------
        evals_result : dictionary
        """
        # ``evals_result_`` only exists after a fit that produced results, so
        # look it up defensively instead of raising AttributeError.
        evals_result = getattr(self, 'evals_result_', None)
        if not evals_result:
            raise LightGBMError('No results found.')
        return evals_result

    def feature_importance(self):
        """
        Feature importances

        This will raise an exception when fit was not called

        Returns
        -------
        Array of normalized feature importances (all zeros when the
        importances sum to zero)
        """
        importance_array = self.booster().feature_importance().astype(np.float32)
        total = importance_array.sum()
        # Avoid a 0/0 division (NaN output) when the importances sum to zero.
        if total == 0:
            return importance_array
        return importance_array / total
wxchan's avatar
wxchan committed
460
461
462

class LGBMRegressor(LGBMModel, LGBMRegressorBase):
    """Regression estimator: LGBMModel without group data, ``l2`` as default eval metric."""

    def fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=None, eval_sample_weight=None,
            eval_init_score=None,
            eval_metric="l2",
            early_stopping_rounds=None, verbose=True,
            feature_name=None, categorical_feature=None,
            other_params=None):
        """
        Fit the regressor.

        Arguments have the same meaning as in ``LGBMModel.fit``; the training
        and evaluation ``group`` arguments are always passed as None.
        """
        # Regression has no query groups, so both group slots are fixed to None.
        super(LGBMRegressor, self).fit(
            X, y,
            sample_weight=sample_weight,
            init_score=init_score,
            group=None,
            eval_set=eval_set,
            eval_sample_weight=eval_sample_weight,
            eval_init_score=eval_init_score,
            eval_group=None,
            eval_metric=eval_metric,
            early_stopping_rounds=early_stopping_rounds,
            verbose=verbose,
            feature_name=feature_name,
            categorical_feature=categorical_feature,
            other_params=other_params)
        return self

wxchan's avatar
wxchan committed
479
480
class LGBMClassifier(LGBMModel, LGBMClassifierBase):
    """Classification estimator: encodes labels and switches to multiclass when needed."""

    def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 silent=True, objective="binary",
                 nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0):
        """Same parameters as ``LGBMModel``; ``objective`` defaults to "binary"."""
        super(LGBMClassifier, self).__init__(boosting_type, num_leaves, max_depth,
                                             learning_rate, n_estimators, max_bin,
                                             silent, objective, nthread,
                                             min_split_gain, min_child_weight, min_child_samples,
                                             subsample, subsample_freq, colsample_bytree,
                                             reg_alpha, reg_lambda, scale_pos_weight,
                                             is_unbalance, seed)

    def fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=None, eval_sample_weight=None,
            eval_init_score=None,
            eval_metric="binary_logloss",
            early_stopping_rounds=None, verbose=True,
            feature_name=None, categorical_feature=None,
            other_params=None):
        """
        Fit the classifier.

        Arguments have the same meaning as in ``LGBMModel.fit``. Labels in
        ``y`` and in every ``eval_set`` entry are encoded with a label encoder
        fitted on ``y``. With more than two classes, ``num_class`` is added to
        the parameters, a non-callable objective is switched to "multiclass",
        and the default eval metric becomes "multi_logloss".
        """
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        # Work on a copy so the caller's dict is never mutated.
        other_params = {} if other_params is None else dict(other_params)
        if self.n_classes_ > 2:
            # Switch to using a multiclass objective in the underlying LGBM instance.
            # A user-supplied callable objective is kept as-is so it is not
            # silently replaced in the estimator's parameters.
            if not callable(self.objective):
                self.objective = "multiclass"
            other_params['num_class'] = self.n_classes_
            if eval_set is not None and eval_metric == "binary_logloss":
                eval_metric = "multi_logloss"

        self._le = LGBMLabelEncoder().fit(y)
        training_labels = self._le.transform(y)

        if eval_set is not None:
            # A single (X, y) tuple is accepted as shorthand for a one-item list.
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            encoded_eval_set = []
            for valid_data in eval_set:
                if valid_data[0] is X and valid_data[1] is y:
                    # Hand over the very same objects used for training so the
                    # parent ``fit`` can recognise it as the training data.
                    encoded_eval_set.append((X, training_labels))
                else:
                    encoded_eval_set.append((valid_data[0], self._le.transform(valid_data[1])))
            eval_set = encoded_eval_set

        super(LGBMClassifier, self).fit(X, training_labels, sample_weight, init_score, None,
                                        eval_set, eval_sample_weight, eval_init_score, None,
                                        eval_metric, early_stopping_rounds,
                                        verbose, feature_name, categorical_feature,
                                        other_params)
        return self

    def predict(self, data, raw_score=False, num_iteration=0):
        """
        Return the predicted class label for each sample.

        Parameters
        ----------
        data : array_like, shape=[n_samples, n_features]
            Input features matrix.

        raw_score : bool
            Forwarded to the booster's ``predict`` as ``raw_score``.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        predicted_label : array_like, shape=[n_samples], in the original label space
        """
        class_probs = self.booster().predict(data,
                                             raw_score=raw_score,
                                             num_iteration=num_iteration)
        if len(class_probs.shape) > 1:
            # One column per class: pick the highest-scoring column.
            column_indexes = np.argmax(class_probs, axis=1)
        else:
            # Single score per row: class 1 when the score exceeds 0.5.
            # NOTE(review): the 0.5 cut-off is also applied when raw_score=True;
            # confirm that is intended for raw (untransformed) scores.
            column_indexes = np.repeat(0, class_probs.shape[0])
            column_indexes[class_probs > 0.5] = 1
        return self._le.inverse_transform(column_indexes)

    def predict_proba(self, data, raw_score=False, num_iteration=0):
        """
        Return the predicted probability for each class for each sample.

        Parameters
        ----------
        data : array_like, shape=[n_samples, n_features]
            Input features matrix.

        raw_score : bool
            Forwarded to the booster's ``predict`` as ``raw_score``.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        predicted_probability : array_like, shape=[n_samples, n_classes]
        """
        class_probs = self.booster().predict(data,
                                             raw_score=raw_score,
                                             num_iteration=num_iteration)
        if self.n_classes_ > 2:
            return class_probs
        # Binary case: the booster output is used as the class-1 column and
        # its complement as the class-0 column.
        classone_probs = class_probs
        classzero_probs = 1.0 - classone_probs
        return np.vstack((classzero_probs, classone_probs)).transpose()

class LGBMRanker(LGBMModel):
    """Ranking estimator: requires group data and defaults to the lambdarank objective."""

    def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 silent=True, objective="lambdarank",
                 nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0):
        """Same parameters as ``LGBMModel``; ``objective`` defaults to "lambdarank"."""
        super(LGBMRanker, self).__init__(boosting_type, num_leaves, max_depth,
                                         learning_rate, n_estimators, max_bin,
                                         silent, objective, nthread,
                                         min_split_gain, min_child_weight, min_child_samples,
                                         subsample, subsample_freq, colsample_bytree,
                                         reg_alpha, reg_lambda, scale_pos_weight,
                                         is_unbalance, seed)

    def fit(self, X, y,
            sample_weight=None, init_score=None, group=None,
            eval_set=None, eval_sample_weight=None,
            eval_init_score=None, eval_group=None,
            eval_metric='ndcg', eval_at=1,
            early_stopping_rounds=None, verbose=True,
            feature_name=None, categorical_feature=None,
            other_params=None):
        """
        Most arguments are the same as in ``LGBMModel.fit`` except the following:

        eval_at : int or list of int
            The evaluation positions of NDCG

        Raises
        ------
        ValueError
            If ``group`` is None, or if ``eval_set`` is given and ``eval_group``
            is None, has a different length, or lacks a group for some eval set.
        """
        # check group data
        if group is None:
            raise ValueError("Should set group for ranking task")

        if eval_set is not None:
            # A single (X, y) tuple is accepted as shorthand for a one-item list.
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            if eval_group is None:
                raise ValueError("Eval_group cannot be None when eval_set is not None")
            elif len(eval_group) != len(eval_set):
                raise ValueError("Length of eval_group should equal to eval_set")
            else:
                # eval_group may be a sequence or a dict keyed by eval_set
                # index; look each entry up by index so a dict is checked by
                # value rather than by key.
                for i in range(len(eval_set)):
                    if isinstance(eval_group, dict):
                        inner_group = eval_group.get(i, None)
                    else:
                        inner_group = eval_group[i]
                    if inner_group is None:
                        raise ValueError("Should set group for all eval dataset for ranking task")

        if eval_at is not None:
            # Work on a copy so the caller's dict is never mutated.
            other_params = {} if other_params is None else dict(other_params)
            if isinstance(eval_at, int):
                eval_at = [eval_at]
            other_params['ndcg_eval_at'] = list(eval_at)
        super(LGBMRanker, self).fit(X, y, sample_weight, init_score, group,
                                    eval_set, eval_sample_weight, eval_init_score, eval_group,
                                    eval_metric, early_stopping_rounds,
                                    verbose, feature_name, categorical_feature,
                                    other_params)
        return self