sklearn.py 26 KB
Newer Older
wxchan's avatar
wxchan committed
1
# coding: utf-8
2
# pylint: disable = invalid-name, W0105, C0111, C0301
wxchan's avatar
wxchan committed
3
4
"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import
5

wxchan's avatar
wxchan committed
6
import numpy as np
7

wxchan's avatar
wxchan committed
8
9
10
11
from .basic import Dataset, LightGBMError
from .compat import (SKLEARN_INSTALLED, LGBMClassifierBase, LGBMDeprecated,
                     LGBMLabelEncoder, LGBMModelBase, LGBMRegressorBase, argc_,
                     range_)
wxchan's avatar
wxchan committed
12
from .engine import train
13

wxchan's avatar
wxchan committed
14

15
def _objective_function_wrapper(func):
    """Decorate a custom objective function for use by ``lightgbm.engine.train``.

    Note: for multi-class task, the y_pred is grouped by class_id first, then by row_id;
          to get the i-th row y_pred in the j-th class, use y_pred[j*num_data+i],
          and grad and hess must be grouped in the same way.

    Parameters
    ----------
    func: callable
        Expects a callable with signature ``func(y_true, y_pred)`` or ``func(y_true, y_pred, group)``:
            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples * n_class] (for multi-class)
                The predicted values
            group: array_like
                group/query data, used for ranking task

    Returns
    -------
    new_func: callable
        The new objective function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

        preds: array_like, shape [n_samples] or shape[n_samples * n_class]
            The predicted values
        dataset: ``dataset``
            The training set from which the labels will be extracted using
            ``dataset.get_label()``
    """
    def inner(preds, dataset):
        """Call ``func`` with data pulled from ``dataset`` and apply sample weights."""
        labels = dataset.get_label()
        n_args = argc_(func)
        if n_args == 2:
            grad, hess = func(labels, preds)
        elif n_args == 3:
            grad, hess = func(labels, preds, dataset.get_group())
        else:
            raise TypeError("Self-defined objective function should have 2 or 3 arguments, got %d" % n_args)
        weight = dataset.get_weight()
        if weight is None:
            # No sample weights: pass the gradient and hessian through unchanged.
            return grad, hess
        if len(weight) == len(grad):
            # Single-class case: one weight per grad/hess entry.
            grad = np.multiply(grad, weight)
            hess = np.multiply(hess, weight)
        else:
            # Multi-class case: grad/hess are laid out class-major, so each
            # sample weight applies once per class at stride num_data.
            num_data = len(weight)
            num_class = len(grad) // num_data
            if num_class * num_data != len(grad):
                raise ValueError("Length of grad and hess should equal to num_class * num_data")
            for class_id in range_(num_class):
                base = class_id * num_data
                for row_id in range_(num_data):
                    grad[base + row_id] *= weight[row_id]
                    hess[base + row_id] *= weight[row_id]
        return grad, hess
    return inner

wxchan's avatar
wxchan committed
73

74
75
76
77
78
79
80
def _eval_function_wrapper(func):
    """Decorate a custom eval function for use by ``lightgbm.engine.train``.

    Note: for multi-class task, the y_pred is grouped by class_id first, then by row_id;
          to get the i-th row y_pred in the j-th class, use y_pred[j*num_data+i].

    Parameters
    ----------
    func: callable
        Expects a callable with one of the signatures:
            ``func(y_true, y_pred)``,
            ``func(y_true, y_pred, weight)``
         or ``func(y_true, y_pred, weight, group)``
            and return (eval_name->str, eval_result->float, is_bigger_better->Bool):

            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples * n_class] (for multi-class)
                The predicted values
            weight: array_like of shape [n_samples]
                The weight of samples
            group: array_like
                group/query data, used for ranking task

    Returns
    -------
    new_func: callable
        The new eval function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

        preds: array_like, shape [n_samples] or shape[n_samples * n_class]
            The predicted values
        dataset: ``dataset``
            The training set from which the labels will be extracted using
            ``dataset.get_label()``
    """
    def inner(preds, dataset):
        """Assemble the arguments ``func`` expects from ``dataset`` and forward the call."""
        labels = dataset.get_label()
        n_args = argc_(func)
        if n_args == 2:
            return func(labels, preds)
        if n_args == 3:
            return func(labels, preds, dataset.get_weight())
        if n_args == 4:
            return func(labels, preds, dataset.get_weight(), dataset.get_group())
        raise TypeError("Self-defined eval function should have 2, 3 or 4 arguments, got %d" % n_args)
    return inner

wxchan's avatar
wxchan committed
122

wxchan's avatar
wxchan committed
123
124
class LGBMModel(LGBMModelBase):
    # Common scikit-learn estimator implementation; LGBMRegressor, LGBMClassifier
    # and LGBMRanker derive from this class.

    def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 subsample_for_bin=50000, objective=None,
                 min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, seed=0, nthread=-1, silent=True, **kwargs):
        """
        Implementation of the Scikit-Learn API for LightGBM.

        Parameters
        ----------
        boosting_type : string
            gbdt, traditional Gradient Boosting Decision Tree
            dart, Dropouts meet Multiple Additive Regression Trees
        num_leaves : int
            Maximum tree leaves for base learners.
        max_depth : int
            Maximum tree depth for base learners, -1 means no limit.
        learning_rate : float
            Boosting learning rate
        n_estimators : int
            Number of boosted trees to fit.
        max_bin : int
            Number of bucketed bin for feature values
        subsample_for_bin : int
            Number of samples for constructing bins.
        objective : string or callable
            Specify the learning task and the corresponding learning objective or
            a custom objective function to be used (see note below).
            default: binary for LGBMClassifier, lambdarank for LGBMRanker
        min_split_gain : float
            Minimum loss reduction required to make a further partition on a leaf node of the tree.
        min_child_weight : int
            Minimum sum of instance weight(hessian) needed in a child(leaf)
        min_child_samples : int
            Minimum number of data need in a child(leaf)
        subsample : float
            Subsample ratio of the training instance.
        subsample_freq : int
            frequence of subsample, <=0 means no enable
        colsample_bytree : float
            Subsample ratio of columns when constructing each tree.
        reg_alpha : float
            L1 regularization term on weights
        reg_lambda : float
            L2 regularization term on weights
        seed : int
            Random number seed.
        nthread : int
            Number of parallel threads
        silent : boolean
            Whether to print messages while running boosting.
        **kwargs : other parameters
            Check http://lightgbm.readthedocs.io/en/latest/Parameters.html for more parameters.
            Note: **kwargs is not supported in sklearn, it may cause unexpected issues.

        Note
        ----
        A custom objective function can be provided for the ``objective``
        parameter. In this case, it should have the signature
        ``objective(y_true, y_pred) -> grad, hess``
            or ``objective(y_true, y_pred, group) -> grad, hess``:

            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples * n_class]
                The predicted values
            group: array_like
                group/query data, used for ranking task
            grad: array_like of shape [n_samples] or shape[n_samples * n_class]
                The value of the gradient for each sample point.
            hess: array_like of shape [n_samples] or shape[n_samples * n_class]
                The value of the second derivative for each sample point

        for multi-class task, the y_pred is group by class_id first, then group by row_id
            if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
            and you should group grad and hess in this way as well
        """
        if not SKLEARN_INSTALLED:
            raise LightGBMError('Scikit-learn is required for this module')

        self.boosting_type = boosting_type
        # Pick a task-appropriate default objective based on the concrete subclass.
        if objective is None:
            if isinstance(self, LGBMRegressor):
                self.objective = "regression"
            elif isinstance(self, LGBMClassifier):
                self.objective = "binary"
            elif isinstance(self, LGBMRanker):
                self.objective = "lambdarank"
            else:
                raise TypeError("Unknown LGBMModel type.")
        else:
            self.objective = objective
        self.num_leaves = num_leaves
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_bin = max_bin
        self.subsample_for_bin = subsample_for_bin
        self.min_split_gain = min_split_gain
        self.min_child_weight = min_child_weight
        self.min_child_samples = min_child_samples
        self.subsample = subsample
        self.subsample_freq = subsample_freq
        self.colsample_bytree = colsample_bytree
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.seed = seed
        self.nthread = nthread
        self.silent = silent
        self._Booster = None  # underlying Booster, created by fit()
        self.evals_result = None  # filled by fit() when eval sets are given
        self.best_iteration = -1  # set by fit() when early stopping is used
        self.best_score = {}
        # A callable objective is wrapped so the engine can call it as fobj(preds, dataset).
        if callable(self.objective):
            self.fobj = _objective_function_wrapper(self.objective)
        else:
            self.fobj = None
        # Extra keyword parameters are kept separately so get_params() can report them.
        self.other_params = {}
        self.set_params(**kwargs)

    def get_params(self, deep=True):
        # Merge the extra **kwargs parameters into the standard sklearn params dict.
        params = super(LGBMModel, self).get_params(deep=deep)
        params.update(self.other_params)
        return params

    # minor change to support `**kwargs`
    def set_params(self, **params):
        for key, value in params.items():
            setattr(self, key, value)
            self.other_params[key] = value
        return self

    def fit(self, X, y,
            sample_weight=None, init_score=None, group=None,
            eval_set=None, eval_names=None, eval_sample_weight=None,
            eval_init_score=None, eval_group=None,
            eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            feature_name='auto', categorical_feature='auto',
            callbacks=None):
        """
        Fit the gradient boosting model

        Parameters
        ----------
        X : array_like
            Feature matrix
        y : array_like
            Labels
        sample_weight : array_like
            weight of training data
        init_score : array_like
            init score of training data
        group : array_like
            group data of training data
        eval_set : list, optional
            A list of (X, y) tuple pairs to use as a validation set for early-stopping
        eval_names: list of string
            Names of eval_set
        eval_sample_weight : List of array
            weight of eval data
        eval_init_score : List of array
            init score of eval data
        eval_group : List of array
            group data of eval data
        eval_metric : str, list of str, callable, optional
            If a str, should be a built-in evaluation metric to use.
            If callable, a custom evaluation metric, see note for more details.
        early_stopping_rounds : int
            Activates early stopping; the model stops when no eval metric improves
            for this many rounds.
        verbose : bool
            If `verbose` and an evaluation set is used, writes the evaluation
        feature_name : list of str, or 'auto'
            Feature names
            If 'auto' and data is pandas DataFrame, use data columns name
        categorical_feature : list of str or int, or 'auto'
            Categorical features,
            type int represents index,
            type str represents feature names (need to specify feature_name as well)
            If 'auto' and data is pandas DataFrame, use pandas categorical columns
        callbacks : list of callback functions
            List of callback functions that are applied at each iteration.
            See Callbacks in Python-API.md for more information.

        Note
        ----
        Custom eval function expects a callable with following functions:
            ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)``
                or ``func(y_true, y_pred, weight, group)``.
            return (eval_name, eval_result, is_bigger_better)
                or list of (eval_name, eval_result, is_bigger_better)

            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape[n_samples * n_class] (for multi-class)
                The predicted values
            weight: array_like of shape [n_samples]
                The weight of samples
            group: array_like
                group/query data, used for ranking task
            eval_name: str
                name of evaluation
            eval_result: float
                eval result
            is_bigger_better: bool
                is eval result bigger better, e.g. AUC is bigger_better.
        for multi-class task, the y_pred is group by class_id first, then group by row_id
          if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
        """
        evals_result = {}
        params = self.get_params()
        # user can set verbose with kwargs, it has higher priority
        if 'verbose' not in params and self.silent:
            params['verbose'] = -1
        # silent/n_estimators are sklearn-level parameters, not engine parameters.
        params.pop('silent', None)
        params.pop('n_estimators', None)
        # n_classes_ exists only on a fitted LGBMClassifier (set by its fit()).
        if hasattr(self, 'n_classes_') and self.n_classes_ > 2:
            params['num_class'] = self.n_classes_
        # eval_at exists only on LGBMRanker (set by its fit()).
        if hasattr(self, 'eval_at'):
            params['ndcg_eval_at'] = self.eval_at
        if self.fobj:
            params['objective'] = 'None'  # objective = nullptr for unknown objective
        if callable(eval_metric):
            feval = _eval_function_wrapper(eval_metric)
        else:
            feval = None
            params['metric'] = eval_metric

        def _construct_dataset(X, y, sample_weight, init_score, group, params):
            # Build a LightGBM Dataset carrying labels, weights and group info.
            ret = Dataset(X, label=y, max_bin=self.max_bin, weight=sample_weight, group=group, params=params)
            ret.set_init_score(init_score)
            return ret

        train_set = _construct_dataset(X, y, sample_weight, init_score, group, params)

        valid_sets = []
        if eval_set is not None:
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            for i, valid_data in enumerate(eval_set):
                """reduce cost for prediction training data"""
                if valid_data[0] is X and valid_data[1] is y:
                    valid_set = train_set
                else:
                    def get_meta_data(collection, i):
                        # Per-eval-set metadata may be given as a list (by position)
                        # or a dict (by index); missing entries yield None.
                        if collection is None:
                            return None
                        elif isinstance(collection, list):
                            return collection[i] if len(collection) > i else None
                        elif isinstance(collection, dict):
                            return collection.get(i, None)
                        else:
                            raise TypeError('eval_sample_weight, eval_init_score, and eval_group should be dict or list')
                    valid_weight = get_meta_data(eval_sample_weight, i)
                    valid_init_score = get_meta_data(eval_init_score, i)
                    valid_group = get_meta_data(eval_group, i)
                    valid_set = _construct_dataset(valid_data[0], valid_data[1], valid_weight, valid_init_score, valid_group, params)
                valid_sets.append(valid_set)

        self._Booster = train(params, train_set,
                              self.n_estimators, valid_sets=valid_sets, valid_names=eval_names,
                              early_stopping_rounds=early_stopping_rounds,
                              evals_result=evals_result, fobj=self.fobj, feval=feval,
                              verbose_eval=verbose, feature_name=feature_name,
                              categorical_feature=categorical_feature,
                              callbacks=callbacks)

        if evals_result:
            self.evals_result = evals_result

        if early_stopping_rounds is not None:
            self.best_iteration = self._Booster.best_iteration
        self.best_score = self._Booster.best_score

        # free dataset
        self.booster_.free_dataset()
        del train_set, valid_sets
        return self

    def predict(self, X, raw_score=False, num_iteration=0):
        """
        Return the predicted value for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        raw_score : bool
            If True, return raw (untransformed) scores.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes]
        """
        return self.booster_.predict(X, raw_score=raw_score, num_iteration=num_iteration)

    def apply(self, X, num_iteration=0):
        """
        Return the predicted leaf every tree for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        X_leaves : array_like, shape=[n_samples, n_trees]
        """
        return self.booster_.predict(X, pred_leaf=True, num_iteration=num_iteration)

    @property
    def booster_(self):
        """Get the underlying lightgbm Booster of this model."""
        if self._Booster is None:
            raise LightGBMError('No booster found. Need to call fit beforehand.')
        return self._Booster

    @property
    def evals_result_(self):
        """Get the evaluation results."""
        if self.evals_result is None:
            raise LightGBMError('No results found. Need to call fit with eval set beforehand.')
        return self.evals_result

    @property
    def feature_importances_(self):
        """
        Get feature importances.

        Note: feature importance in sklearn interface used to normalize to 1,
            it's deprecated after 2.0.4 and same as Booster.feature_importance() now
        """
        return self.booster_.feature_importance()

    @LGBMDeprecated('Use attribute booster_ instead.')
    def booster(self):
        # Deprecated accessor kept for backward compatibility.
        return self.booster_

    @LGBMDeprecated('Use attribute feature_importances_ instead.')
    def feature_importance(self):
        # Deprecated accessor kept for backward compatibility.
        return self.feature_importances_
wxchan's avatar
wxchan committed
472

wxchan's avatar
wxchan committed
473

wxchan's avatar
wxchan committed
474
475
class LGBMRegressor(LGBMModel, LGBMRegressorBase):
    """LightGBM regressor following the scikit-learn estimator API."""

    def fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=None, eval_names=None, eval_sample_weight=None,
            eval_init_score=None,
            eval_metric="l2",
            early_stopping_rounds=None, verbose=True,
            feature_name='auto', categorical_feature='auto', callbacks=None):
        """Fit the model; same as ``LGBMModel.fit`` with ``eval_metric`` defaulting to "l2"."""
        super(LGBMRegressor, self).fit(
            X, y,
            sample_weight=sample_weight,
            init_score=init_score,
            eval_set=eval_set,
            eval_names=eval_names,
            eval_sample_weight=eval_sample_weight,
            eval_init_score=eval_init_score,
            eval_metric=eval_metric,
            early_stopping_rounds=early_stopping_rounds,
            verbose=verbose,
            feature_name=feature_name,
            categorical_feature=categorical_feature,
            callbacks=callbacks)
        return self

wxchan's avatar
wxchan committed
496

wxchan's avatar
wxchan committed
497
498
class LGBMClassifier(LGBMModel, LGBMClassifierBase):
    """LightGBM classifier following the scikit-learn estimator API."""

    def fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=None, eval_names=None, eval_sample_weight=None,
            eval_init_score=None,
            eval_metric="logloss",
            early_stopping_rounds=None, verbose=True,
            feature_name='auto', categorical_feature='auto',
            callbacks=None):
        """
        Fit the gradient boosting classifier.

        Same arguments as ``LGBMModel.fit``. Labels are encoded with
        ``LGBMLabelEncoder`` before training, and ``eval_metric`` aliases
        (logloss/error) are mapped to the binary or multiclass variant
        depending on the number of classes found in ``y``.
        """
        self._le = LGBMLabelEncoder().fit(y)
        _y = self._le.transform(y)

        self.classes = self._le.classes_
        self.n_classes = len(self.classes_)
        if self.n_classes > 2:
            # Switch to using a multiclass objective in the underlying LGBM instance
            self.objective = "multiclass"
            if eval_metric == 'logloss' or eval_metric == 'binary_logloss':
                eval_metric = "multi_logloss"
            elif eval_metric == 'error' or eval_metric == 'binary_error':
                eval_metric = "multi_error"
        else:
            if eval_metric == 'logloss' or eval_metric == 'multi_logloss':
                eval_metric = 'binary_logloss'
            elif eval_metric == 'error' or eval_metric == 'multi_error':
                eval_metric = 'binary_error'

        if eval_set is not None:
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            else:
                # Fix: work on a copy; previously a caller-supplied list was
                # mutated in place, silently replacing the user's labels with
                # the encoded ones.
                eval_set = list(eval_set)
            for i, (valid_x, valid_y) in enumerate(eval_set):
                # Reuse the already-encoded labels when the validation set is
                # the training set itself.
                if valid_x is X and valid_y is y:
                    eval_set[i] = (valid_x, _y)
                else:
                    eval_set[i] = (valid_x, self._le.transform(valid_y))

        super(LGBMClassifier, self).fit(X, _y, sample_weight=sample_weight,
                                        init_score=init_score, eval_set=eval_set,
                                        eval_names=eval_names,
                                        eval_sample_weight=eval_sample_weight,
                                        eval_init_score=eval_init_score,
                                        eval_metric=eval_metric,
                                        early_stopping_rounds=early_stopping_rounds,
                                        verbose=verbose, feature_name=feature_name,
                                        categorical_feature=categorical_feature,
                                        callbacks=callbacks)
        return self

    def predict(self, X, raw_score=False, num_iteration=0):
        """Return the predicted class label (original label space) for each sample."""
        class_probs = self.predict_proba(X, raw_score, num_iteration)
        class_index = np.argmax(class_probs, axis=1)
        return self._le.inverse_transform(class_index)

    def predict_proba(self, X, raw_score=False, num_iteration=0):
        """
        Return the predicted probability for each class for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        raw_score : bool
            If True, return raw (untransformed) scores.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        predicted_probability : array_like, shape=[n_samples, n_classes]
        """
        class_probs = self.booster_.predict(X, raw_score=raw_score, num_iteration=num_iteration)
        if self.n_classes > 2:
            return class_probs
        else:
            # Binary task: the booster outputs P(class 1) only; assemble both columns.
            return np.vstack((1. - class_probs, class_probs)).transpose()

    @property
    def classes_(self):
        """Get class label array."""
        # Fix: the attribute does not exist at all before fit(), so a plain
        # `self.classes` access raised AttributeError instead of the intended
        # LightGBMError; getattr restores the documented behavior.
        if getattr(self, 'classes', None) is None:
            raise LightGBMError('No classes found. Need to call fit beforehand.')
        return self.classes

    @property
    def n_classes_(self):
        """Get number of classes"""
        if getattr(self, 'n_classes', None) is None:
            raise LightGBMError('No classes found. Need to call fit beforehand.')
        return self.n_classes
wxchan's avatar
wxchan committed
586

wxchan's avatar
wxchan committed
587

wxchan's avatar
wxchan committed
588
589
class LGBMRanker(LGBMModel):
    """LightGBM ranker (lambdarank objective) following the scikit-learn estimator API."""

    def fit(self, X, y,
            sample_weight=None, init_score=None, group=None,
            eval_set=None, eval_names=None, eval_sample_weight=None,
            eval_init_score=None, eval_group=None,
            eval_metric='ndcg', eval_at=1,
            early_stopping_rounds=None, verbose=True,
            feature_name='auto', categorical_feature='auto',
            callbacks=None):
        """
        Most arguments like common methods except following:

        eval_at : list of int
            The evaluation positions of NDCG
        """
        # Ranking requires query/group information for the training data.
        if group is None:
            raise ValueError("Should set group for ranking task")

        if eval_set is not None:
            if eval_group is None:
                raise ValueError("Eval_group cannot be None when eval_set is not None")
            if len(eval_group) != len(eval_set):
                raise ValueError("Length of eval_group should equal to eval_set")
            # Every validation set must come with its own group data; when a
            # dict is used it must be keyed by consecutive indices from 0.
            if isinstance(eval_group, dict):
                group_missing = any(idx not in eval_group or eval_group[idx] is None
                                    for idx in range_(len(eval_group)))
            else:
                group_missing = isinstance(eval_group, list) and any(g is None for g in eval_group)
            if group_missing:
                raise ValueError("Should set group for all eval dataset for ranking task; if you use dict, the index should start from 0")

        # Stored so LGBMModel.fit can forward it as the ndcg_eval_at parameter.
        if eval_at is not None:
            self.eval_at = eval_at
        super(LGBMRanker, self).fit(X, y, sample_weight=sample_weight,
                                    init_score=init_score, group=group,
                                    eval_set=eval_set, eval_names=eval_names,
                                    eval_sample_weight=eval_sample_weight,
                                    eval_init_score=eval_init_score, eval_group=eval_group,
                                    eval_metric=eval_metric,
                                    early_stopping_rounds=early_stopping_rounds,
                                    verbose=verbose, feature_name=feature_name,
                                    categorical_feature=categorical_feature,
                                    callbacks=callbacks)
        return self