"git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "546056e7673f2f81b2c72bd4c823a8f031561b55"
# coding: utf-8
# pylint: disable = invalid-name, W0105, C0111, C0301
"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import
5
import inspect
wxchan's avatar
wxchan committed
6
7

import numpy as np
8
from .basic import LightGBMError, Dataset
wxchan's avatar
wxchan committed
9
from .engine import train
10
'''sklearn'''
wxchan's avatar
wxchan committed
11
12
13
14
# scikit-learn is an optional dependency: when it is importable the wrapper
# classes derive from the real sklearn bases, otherwise plain ``object``
# stand-ins are used so that this module can still be imported.
try:
    from sklearn.base import BaseEstimator
    from sklearn.base import RegressorMixin, ClassifierMixin
    from sklearn.preprocessing import LabelEncoder
    from sklearn.utils import deprecated
    SKLEARN_INSTALLED = True
    LGBMModelBase = BaseEstimator
    LGBMRegressorBase = RegressorMixin
    LGBMClassifierBase = ClassifierMixin
    LGBMLabelEncoder = LabelEncoder
except ImportError:
    SKLEARN_INSTALLED = False
    LGBMModelBase = object
    LGBMClassifierBase = object
    LGBMRegressorBase = object
    LGBMLabelEncoder = None

    def deprecated(extra=''):  # pylint: disable=unused-argument
        """No-op stand-in for ``sklearn.utils.deprecated``.

        The estimator classes below use ``@deprecated(...)`` at class-definition
        time; without this fallback the module would fail with ``NameError`` on
        import when scikit-learn is missing, instead of raising the explicit
        error from ``LGBMModel.__init__``.
        """
        def decorator(obj):
            """Return the decorated object unchanged."""
            return obj
        return decorator

28
def _objective_function_wrapper(func):
    """Decorate an objective function.

    Note: for multi-class task, the y_pred is group by class_id first, then group by row_id
          if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
          and you should group grad and hess in this way as well

    Parameters
    ----------
    func: callable
        Expects a callable with signature ``func(y_true, y_pred)`` or ``func(y_true, y_pred, group)``
        returning ``(grad, hess)``:
            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples * n_class] (for multi-class)
                The predicted values
            group: array_like
                group/query data, used for ranking task

    Returns
    -------
    new_func: callable
        The new objective function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

        preds: array_like, shape [n_samples] or shape[n_samples * n_class]
            The predicted values
        dataset: ``dataset``
            The training set from which the labels will be extracted using
            ``dataset.get_label()``
    """
    def inner(preds, dataset):
        """Call ``func`` on the labels of ``dataset`` and weight the returned grad/hess.

        Raises
        ------
        TypeError
            If ``func`` does not take 2 or 3 positional arguments.
        ValueError
            If the length of grad is not a multiple of the number of weights.
        """
        labels = dataset.get_label()
        # inspect.getargspec was removed in Python 3.11 and rejects annotated or
        # keyword-only signatures; getfullargspec counts arguments the same way.
        # Python 2 only provides getargspec, hence the fallback.
        get_spec = getattr(inspect, 'getfullargspec', None)
        if get_spec is None:
            get_spec = inspect.getargspec
        argc = len(get_spec(func).args)
        if argc == 2:
            grad, hess = func(labels, preds)
        elif argc == 3:
            grad, hess = func(labels, preds, dataset.get_group())
        else:
            raise TypeError("Self-defined objective function should have 2 or 3 arguments, got %d" % (argc))

        # weighted for objective
        weight = dataset.get_weight()
        if weight is None:
            return grad, hess

        num_data = len(weight)
        if len(grad) != num_data:
            # Multi-class: grad/hess are laid out class by class, i.e. index
            # k * num_data + i belongs to row i, so the weights repeat per class.
            if num_data == 0 or len(grad) % num_data != 0:
                raise ValueError("Length of grad and hess should equal to num_class * num_data")
            weight = np.tile(weight, len(grad) // num_data)
        # New arrays are returned; the arrays produced by ``func`` are not modified.
        return np.multiply(grad, weight), np.multiply(hess, weight)
    return inner

86
87
88
89
90
91
92
def _eval_function_wrapper(func):
    """Decorate an eval function.

    Note: for multi-class task, the y_pred is group by class_id first, then group by row_id
          if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]

    Parameters
    ----------
    func: callable
        Expects a callable with following functions:
            ``func(y_true, y_pred)``,
            ``func(y_true, y_pred, weight)``
         or ``func(y_true, y_pred, weight, group)``
            and return (eval_name->str, eval_result->float, is_bigger_better->Bool):

            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples * n_class] (for multi-class)
                The predicted values
            weight: array_like of shape [n_samples]
                The weight of samples
            group: array_like
                group/query data, used for ranking task

    Returns
    -------
    new_func: callable
        The new eval function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

        preds: array_like, shape [n_samples] or shape[n_samples * n_class]
            The predicted values
        dataset: ``dataset``
            The training set from which the labels will be extracted using
            ``dataset.get_label()``
    """
    def inner(preds, dataset):
        """Call ``func`` with the labels (and weight/group) taken from ``dataset``.

        Raises
        ------
        TypeError
            If ``func`` does not take 2, 3 or 4 positional arguments.
        """
        labels = dataset.get_label()
        # inspect.getargspec was removed in Python 3.11 and rejects annotated or
        # keyword-only signatures; getfullargspec counts arguments the same way.
        # Python 2 only provides getargspec, hence the fallback.
        get_spec = getattr(inspect, 'getfullargspec', None)
        if get_spec is None:
            get_spec = inspect.getargspec
        argc = len(get_spec(func).args)
        if argc == 2:
            return func(labels, preds)
        elif argc == 3:
            return func(labels, preds, dataset.get_weight())
        elif argc == 4:
            return func(labels, preds, dataset.get_weight(), dataset.get_group())
        else:
            raise TypeError("Self-defined eval function should have 2, 3 or 4 arguments, got %d" % (argc))
    return inner

wxchan's avatar
wxchan committed
134
135
class LGBMModel(LGBMModelBase):
    """Base estimator wrapping a LightGBM booster behind a scikit-learn style interface."""

    def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 silent=True, objective="regression",
                 nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0,
                 drop_rate=0.1, skip_drop=0.5, max_drop=50,
                 uniform_drop=False, xgboost_dart_mode=False):
        """
        Implementation of the Scikit-Learn API for LightGBM.

        Parameters
        ----------
        boosting_type : string
            gbdt, traditional Gradient Boosting Decision Tree
            dart, Dropouts meet Multiple Additive Regression Trees
        num_leaves : int
            Maximum tree leaves for base learners.
        max_depth : int
            Maximum tree depth for base learners, -1 means no limit.
        learning_rate : float
            Boosting learning rate
        n_estimators : int
            Number of boosted trees to fit.
        max_bin : int
            Number of bucketed bin for feature values
        silent : boolean
            Whether to suppress messages while running boosting
            (``fit`` maps ``silent=True`` to ``verbose=-1``).
        objective : string or callable
            Specify the learning task and the corresponding learning objective or
            a custom objective function to be used (see note below).
            default: binary for LGBMClassifier, lambdarank for LGBMRanker
        nthread : int
            Number of parallel threads
        min_split_gain : float
            Minimum loss reduction required to make a further partition on a leaf node of the tree.
        min_child_weight : int
            Minimum sum of instance weight(hessian) needed in a child(leaf)
        min_child_samples : int
            Minimum number of data need in a child(leaf)
        subsample : float
            Subsample ratio of the training instance.
        subsample_freq : int
            Frequency of subsample, <=0 means no enable
        colsample_bytree : float
            Subsample ratio of columns when constructing each tree.
        reg_alpha : float
            L1 regularization term on weights
        reg_lambda : float
            L2 regularization term on weights
        scale_pos_weight : float
            Balancing of positive and negative weights.
        is_unbalance : bool
            Is unbalance for binary classification
        seed : int
            Random number seed.
        drop_rate : float
            Only used when boosting_type='dart'. Probability to select dropping trees.
        skip_drop : float
            Only used when boosting_type='dart'. Probability to skip dropping trees.
        max_drop : int
            Only used when boosting_type='dart'. Max number of dropped trees in one iteration.
        uniform_drop : bool
            Only used when boosting_type='dart'. If true, drop trees uniformly, else drop according to weights.
        xgboost_dart_mode : bool
            Only used when boosting_type='dart'. Whether use xgboost dart mode.

        Raises
        ------
        LightGBMError
            If scikit-learn could not be imported.

        Note
        ----
        A custom objective function can be provided for the ``objective``
        parameter. In this case, it should have the signature
        ``objective(y_true, y_pred) -> grad, hess``
            or ``objective(y_true, y_pred, group) -> grad, hess``:

            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples * n_class]
                The predicted values
            group: array_like
                group/query data, used for ranking task
            grad: array_like of shape [n_samples] or shape[n_samples * n_class]
                The value of the gradient for each sample point.
            hess: array_like of shape [n_samples] or shape[n_samples * n_class]
                The value of the second derivative for each sample point

        for multi-class task, the y_pred is group by class_id first, then group by row_id
            if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
            and you should group grad and hess in this way as well
        """
        if not SKLEARN_INSTALLED:
            raise LightGBMError('Scikit-learn is required for this module')

        self.boosting_type = boosting_type
        self.num_leaves = num_leaves
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_bin = max_bin
        self.silent = silent
        self.objective = objective
        self.nthread = nthread
        self.min_split_gain = min_split_gain
        self.min_child_weight = min_child_weight
        self.min_child_samples = min_child_samples
        self.subsample = subsample
        self.subsample_freq = subsample_freq
        self.colsample_bytree = colsample_bytree
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.scale_pos_weight = scale_pos_weight
        self.is_unbalance = is_unbalance
        self.seed = seed
        self.drop_rate = drop_rate
        self.skip_drop = skip_drop
        self.max_drop = max_drop
        self.uniform_drop = uniform_drop
        self.xgboost_dart_mode = xgboost_dart_mode
        # Fitted state, filled in by ``fit``.
        self._Booster = None
        self.evals_result = None
        self.best_iteration = -1
        # A callable objective is adapted once, here, to the (preds, dataset)
        # signature expected by ``train``.
        if callable(self.objective):
            self.fobj = _objective_function_wrapper(self.objective)
        else:
            self.fobj = None

    def fit(self, X, y,
            sample_weight=None, init_score=None, group=None,
            eval_set=None, eval_sample_weight=None,
            eval_init_score=None, eval_group=None,
            eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            feature_name=None, categorical_feature=None,
            callbacks=None):
        """
        Fit the gradient boosting model

        Parameters
        ----------
        X : array_like
            Feature matrix
        y : array_like
            Labels
        sample_weight : array_like
            weight of training data
        init_score : array_like
            init score of training data
        group : array_like
            group data of training data
        eval_set : list, optional
            A list of (X, y) tuple pairs to use as a validation set for early-stopping.
            A single (X, y) tuple is also accepted.
        eval_sample_weight : List of array (or dict keyed by eval_set index)
            weight of eval data
        eval_init_score : List of array (or dict keyed by eval_set index)
            init score of eval data
        eval_group : List of array (or dict keyed by eval_set index)
            group data of eval data
        eval_metric : str, list of str, callable, optional
            If a str, should be a built-in evaluation metric to use.
            If callable, a custom evaluation metric, see note for more details.
            If None, no ``metric`` parameter is passed to the booster.
        early_stopping_rounds : int
            Forwarded to ``train``. When not None, ``best_iteration`` is
            taken from the trained booster.
        verbose : bool
            Forwarded to ``train`` as ``verbose_eval``.
        feature_name : list of str
            Feature names
        categorical_feature : list of str or int
            Categorical features,
            type int represents index,
            type str represents feature names (need to specify feature_name as well)
        callbacks : list of callback functions
            List of callback functions that are applied at each iteration.
            See Callbacks in Python-API.md for more information.

        Returns
        -------
        self : the fitted estimator

        Raises
        ------
        TypeError
            If eval_sample_weight, eval_init_score or eval_group is neither None,
            a list nor a dict.

        Note
        ----
        Custom eval function expects a callable with following functions:
            ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)``
                or ``func(y_true, y_pred, weight, group)``.
            return (eval_name, eval_result, is_bigger_better)
                or list of (eval_name, eval_result, is_bigger_better)

            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples * n_class] (for multi-class)
                The predicted values
            weight: array_like of shape [n_samples]
                The weight of samples
            group: array_like
                group/query data, used for ranking task
            eval_name: str
                name of evaluation
            eval_result: float
                eval result
            is_bigger_better: bool
                is eval result bigger better, e.g. AUC is bigger_better.
        for multi-class task, the y_pred is group by class_id first, then group by row_id
          if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
        """
        evals_result = {}
        params = self.get_params()
        params['verbose'] = -1 if self.silent else 1
        # Attributes contributed by subclasses (classifier / ranker).
        if hasattr(self, 'n_classes_') and self.n_classes_ > 2:
            params['num_class'] = self.n_classes_
        if hasattr(self, 'eval_at'):
            params['ndcg_eval_at'] = self.eval_at
        if self.fobj:
            # objective = nullptr for unknown objective
            params['objective'] = 'None'

        if callable(eval_metric):
            feval = _eval_function_wrapper(eval_metric)
        else:
            feval = None
            # Do not forward a ``None`` metric; leaving the key out lets the
            # booster fall back to its own default.
            if eval_metric is not None:
                params['metric'] = eval_metric

        def _construct_dataset(X, y, sample_weight, init_score, group, params):
            """Build a Dataset carrying label, weight, group and init score."""
            ret = Dataset(X, label=y, max_bin=self.max_bin, weight=sample_weight, group=group, params=params)
            ret.set_init_score(init_score)
            return ret

        def _get_meta_data(collection, i):
            """Return the i-th entry of a per-eval-set list/dict, or None if absent."""
            if collection is None:
                return None
            elif isinstance(collection, list):
                return collection[i] if len(collection) > i else None
            elif isinstance(collection, dict):
                return collection.get(i, None)
            else:
                raise TypeError('eval_sample_weight, eval_init_score, and eval_group should be dict or list')

        train_set = _construct_dataset(X, y, sample_weight, init_score, group, params)

        valid_sets = []
        if eval_set is not None:
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            for i, valid_data in enumerate(eval_set):
                # reduce cost for prediction training data: reuse the training
                # Dataset when the very same objects are passed for evaluation
                if valid_data[0] is X and valid_data[1] is y:
                    valid_set = train_set
                else:
                    valid_weight = _get_meta_data(eval_sample_weight, i)
                    valid_init_score = _get_meta_data(eval_init_score, i)
                    valid_group = _get_meta_data(eval_group, i)
                    valid_set = _construct_dataset(valid_data[0], valid_data[1], valid_weight,
                                                   valid_init_score, valid_group, params)
                valid_sets.append(valid_set)

        self._Booster = train(params, train_set,
                              self.n_estimators, valid_sets=valid_sets,
                              early_stopping_rounds=early_stopping_rounds,
                              evals_result=evals_result, fobj=self.fobj, feval=feval,
                              verbose_eval=verbose, feature_name=feature_name,
                              categorical_feature=categorical_feature,
                              callbacks=callbacks)

        # Always overwrite the fitted state so that a refit never exposes the
        # evaluation history or best iteration of an earlier fit.
        self.evals_result = evals_result if evals_result else None
        if early_stopping_rounds is not None:
            self.best_iteration = self._Booster.best_iteration
        else:
            self.best_iteration = -1
        return self

    def predict(self, X, raw_score=False, num_iteration=0):
        """
        Return the predicted value for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        raw_score : bool
            Forwarded to the booster's ``predict`` as ``raw_score``.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes]
        """
        return self.booster_.predict(X, raw_score=raw_score, num_iteration=num_iteration)

    def apply(self, X, num_iteration=0):
        """
        Return the predicted leaf every tree for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        X_leaves : array_like, shape=[n_samples, n_trees]
        """
        return self.booster_.predict(X, pred_leaf=True, num_iteration=num_iteration)

    @property
    def booster_(self):
        """Get the underlying lightgbm Booster of this model."""
        if self._Booster is None:
            raise LightGBMError('No booster found. Need to call fit beforehand.')
        return self._Booster

    @property
    def evals_result_(self):
        """Get the evaluation results."""
        if self.evals_result is None:
            raise LightGBMError('No results found. Need to call fit with eval set beforehand.')
        return self.evals_result

    @property
    def feature_importance_(self):
        """Get normalized feature importances (all zeros if the total importance is zero)."""
        importance_array = self.booster_.feature_importance().astype(np.float32)
        total = importance_array.sum()
        if total == 0:
            # Nothing to normalise; avoid a 0/0 division producing NaN.
            return importance_array
        return importance_array / total

    @deprecated('Use attribute booster_ instead.')
    def booster(self):
        """Deprecated accessor; returns ``booster_``."""
        return self.booster_

    @deprecated('Use attribute feature_importance_ instead.')
    def feature_importance(self):
        """Deprecated accessor; returns ``feature_importance_``."""
        return self.feature_importance_
wxchan's avatar
wxchan committed
460
461
462

class LGBMRegressor(LGBMModel, LGBMRegressorBase):
    """Regression estimator: ``LGBMModel`` whose ``fit`` evaluates with ``l2`` by default."""

    def fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=None, eval_sample_weight=None,
            eval_init_score=None,
            eval_metric="l2",
            early_stopping_rounds=None, verbose=True,
            feature_name=None, categorical_feature=None, callbacks=None):
        """
        Fit the regressor by delegating to ``LGBMModel.fit``.

        Takes the same arguments as ``LGBMModel.fit`` except that no group
        arguments are accepted and ``eval_metric`` defaults to ``"l2"``.

        Returns
        -------
        self : the fitted estimator
        """
        parent_fit = super(LGBMRegressor, self).fit
        parent_fit(
            X, y,
            sample_weight=sample_weight,
            init_score=init_score,
            eval_set=eval_set,
            eval_sample_weight=eval_sample_weight,
            eval_init_score=eval_init_score,
            eval_metric=eval_metric,
            early_stopping_rounds=early_stopping_rounds,
            verbose=verbose,
            feature_name=feature_name,
            categorical_feature=categorical_feature,
            callbacks=callbacks)
        return self

wxchan's avatar
wxchan committed
482
483
class LGBMClassifier(LGBMModel, LGBMClassifierBase):
    """Classification estimator: label-encodes targets and handles binary vs. multi-class setup."""

    def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 silent=True, objective="binary",
                 nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0,
                 drop_rate=0.1, skip_drop=0.5, max_drop=50,
                 uniform_drop=False, xgboost_dart_mode=False):
        """Create a classifier; parameters are those of ``LGBMModel`` with ``objective="binary"`` by default."""
        # Fitted state, filled in by ``fit`` and exposed via ``classes_`` / ``n_classes_``.
        self.classes, self.n_classes = None, None
        super(LGBMClassifier, self).__init__(boosting_type=boosting_type, num_leaves=num_leaves,
                                             max_depth=max_depth, learning_rate=learning_rate,
                                             n_estimators=n_estimators, max_bin=max_bin,
                                             silent=silent, objective=objective, nthread=nthread,
                                             min_split_gain=min_split_gain, min_child_weight=min_child_weight,
                                             min_child_samples=min_child_samples, subsample=subsample,
                                             subsample_freq=subsample_freq, colsample_bytree=colsample_bytree,
                                             reg_alpha=reg_alpha, reg_lambda=reg_lambda,
                                             scale_pos_weight=scale_pos_weight, is_unbalance=is_unbalance, seed=seed,
                                             drop_rate=drop_rate, skip_drop=skip_drop, max_drop=max_drop,
                                             uniform_drop=uniform_drop, xgboost_dart_mode=xgboost_dart_mode)

    def fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=None, eval_sample_weight=None,
            eval_init_score=None,
            eval_metric="binary_logloss",
            early_stopping_rounds=None, verbose=True,
            feature_name=None, categorical_feature=None,
            callbacks=None):
        """
        Fit the classifier.

        Labels in ``y`` and in every ``eval_set`` entry are encoded with a label
        encoder fitted on ``y`` before being handed to ``LGBMModel.fit``; the
        remaining arguments are forwarded unchanged. With more than two classes
        a string objective is switched to ``"multiclass"`` and the default
        ``"binary_logloss"`` metric to ``"multi_logloss"``.

        Returns
        -------
        self : the fitted estimator
        """
        self._le = LGBMLabelEncoder().fit(y)
        y = self._le.transform(y)

        self.classes = self._le.classes_
        self.n_classes = len(self.classes_)
        if self.n_classes > 2:
            # Switch to using a multiclass objective in the underlying LGBM instance.
            # A callable objective is left alone: it is already wrapped as ``fobj``
            # and overwriting it would lose it from the estimator's parameters.
            if not callable(self.objective):
                self.objective = "multiclass"
            if eval_set is not None and eval_metric == "binary_logloss":
                eval_metric = "multi_logloss"
        elif self.objective == "multiclass":
            # Undo the switch made by an earlier multi-class fit: ``num_class`` is
            # only passed on for more than two classes, so "multiclass" cannot be
            # used for a binary problem here.
            self.objective = "binary"

        if eval_set is not None:
            # Accept a single (X, y) pair like the base class does.
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            eval_set = [(valid[0], self._le.transform(valid[1])) for valid in eval_set]

        super(LGBMClassifier, self).fit(X, y, sample_weight=sample_weight,
                                        init_score=init_score, eval_set=eval_set,
                                        eval_sample_weight=eval_sample_weight,
                                        eval_init_score=eval_init_score,
                                        eval_metric=eval_metric,
                                        early_stopping_rounds=early_stopping_rounds,
                                        verbose=verbose, feature_name=feature_name,
                                        categorical_feature=categorical_feature,
                                        callbacks=callbacks)
        return self

    def predict(self, X, raw_score=False, num_iteration=0):
        """
        Return the predicted class label for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        raw_score : bool
            Whether the decision is taken on raw scores instead of probabilities.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        predicted_label : array_like, shape=[n_samples], in the original label space
        """
        if raw_score and self.n_classes_ <= 2:
            # Raw binary scores are margins, not probabilities: the decision
            # boundary is 0, not 0.5 (NOTE(review): assumes the usual sigmoid link
            # with a positive slope - confirm for custom objectives).
            margins = self.booster_.predict(X, raw_score=True, num_iteration=num_iteration)
            class_index = (np.asarray(margins) > 0).astype(int)
        else:
            class_probs = self.predict_proba(X, raw_score, num_iteration)
            class_index = np.argmax(class_probs, axis=1)
        return self._le.inverse_transform(class_index)

    def predict_proba(self, X, raw_score=False, num_iteration=0):
        """
        Return the predicted probability for each class for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        raw_score : bool
            Forwarded to the booster's ``predict``. For a binary model the
            result is still stacked as ``(1 - score, score)``, which is not a
            probability pair when raw scores are requested.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        predicted_probability : array_like, shape=[n_samples, n_classes]
        """
        class_probs = self.booster_.predict(X, raw_score=raw_score, num_iteration=num_iteration)
        if self.n_classes > 2:
            return class_probs
        else:
            # The booster returns one score per row for binary problems;
            # expand it into two columns (negative class, positive class).
            return np.vstack((1. - class_probs, class_probs)).transpose()

    @property
    def classes_(self):
        """Get class label array."""
        if self.classes is None:
            raise LightGBMError('No classes found. Need to call fit beforehand.')
        return self.classes

    @property
    def n_classes_(self):
        """Get number of classes"""
        if self.n_classes is None:
            raise LightGBMError('No classes found. Need to call fit beforehand.')
        return self.n_classes
wxchan's avatar
wxchan committed
579
580
581

class LGBMRanker(LGBMModel):
    """Ranking estimator: ``LGBMModel`` with a ``lambdarank`` default objective and mandatory group data."""

    def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 silent=True, objective="lambdarank",
                 nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0,
                 drop_rate=0.1, skip_drop=0.5, max_drop=50,
                 uniform_drop=False, xgboost_dart_mode=False):
        """Create a ranker; parameters are those of ``LGBMModel`` with ``objective="lambdarank"`` by default."""
        super(LGBMRanker, self).__init__(boosting_type=boosting_type, num_leaves=num_leaves,
                                         max_depth=max_depth, learning_rate=learning_rate,
                                         n_estimators=n_estimators, max_bin=max_bin,
                                         silent=silent, objective=objective, nthread=nthread,
                                         min_split_gain=min_split_gain, min_child_weight=min_child_weight,
                                         min_child_samples=min_child_samples, subsample=subsample,
                                         subsample_freq=subsample_freq, colsample_bytree=colsample_bytree,
                                         reg_alpha=reg_alpha, reg_lambda=reg_lambda,
                                         scale_pos_weight=scale_pos_weight, is_unbalance=is_unbalance, seed=seed,
                                         drop_rate=drop_rate, skip_drop=skip_drop, max_drop=max_drop,
                                         uniform_drop=uniform_drop, xgboost_dart_mode=xgboost_dart_mode)

    def fit(self, X, y,
            sample_weight=None, init_score=None, group=None,
            eval_set=None, eval_sample_weight=None,
            eval_init_score=None, eval_group=None,
            eval_metric='ndcg', eval_at=1,
            early_stopping_rounds=None, verbose=True,
            feature_name=None, categorical_feature=None,
            callbacks=None):
        """
        Most arguments like common methods except following:

        eval_at : list of int
            The evaluation positions of NDCG. When not None it is stored on the
            estimator as ``eval_at``.

        Returns
        -------
        self : the fitted estimator

        Raises
        ------
        ValueError
            If ``group`` is None, or if ``eval_set`` is given without a matching
            ``eval_group`` entry for every evaluation set.
        """
        # check group data
        if group is None:
            raise ValueError("Should set group for ranking task")

        if eval_set is not None:
            # Accept a single (X, y) pair like the base class does; otherwise its
            # length (2) would be compared against the number of eval groups.
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            if eval_group is None:
                raise ValueError("Eval_group cannot be None when eval_set is not None")
            elif len(eval_group) != len(eval_set):
                raise ValueError("Length of eval_group should equal to eval_set")
            elif (isinstance(eval_group, dict) and any(i not in eval_group or eval_group[i] is None for i in range(len(eval_group)))) \
                    or (isinstance(eval_group, list) and any(one_group is None for one_group in eval_group)):
                raise ValueError("Should set group for all eval dataset for ranking task; if you use dict, the index should start from 0")

        if eval_at is not None:
            self.eval_at = eval_at
        super(LGBMRanker, self).fit(X, y, sample_weight=sample_weight,
                                    init_score=init_score, group=group,
                                    eval_set=eval_set, eval_sample_weight=eval_sample_weight,
                                    eval_init_score=eval_init_score, eval_group=eval_group,
                                    eval_metric=eval_metric,
                                    early_stopping_rounds=early_stopping_rounds,
                                    verbose=verbose, feature_name=feature_name,
                                    categorical_feature=categorical_feature,
                                    callbacks=callbacks)
        return self