# coding: utf-8
# pylint: disable = invalid-name, W0105, C0111, C0301
"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import

import numpy as np
import warnings

from .basic import Dataset, LightGBMError
from .compat import (SKLEARN_INSTALLED, LGBMClassifierBase, LGBMDeprecated,
                     LGBMLabelEncoder, LGBMModelBase, LGBMRegressorBase, argc_,
                     range_)
from .engine import train


# DeprecationWarning is not shown by default, so let's create our own with higher level
class LGBMDeprecationWarning(UserWarning):
    pass


def _objective_function_wrapper(func):
    """Decorate an objective function.
    Note: for multi-class tasks, y_pred is grouped by class_id first, then by row_id.
          To access the prediction for the i-th row of the j-th class, use
          y_pred[j * num_data + i]; grad and hess must be grouped in the same way.
    Parameters
    ----------
    func: callable
        Expects a callable with signature ``func(y_true, y_pred)`` or ``func(y_true, y_pred, group)``:
            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape [n_samples * n_class] (for multi-class)
                The predicted values
            group: array_like
                group/query data, used for ranking task

    Returns
    -------
    new_func: callable
        The new objective function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

            preds: array_like, shape [n_samples] or shape [n_samples * n_class]
                The predicted values
            dataset: ``dataset``
                The training set from which the labels will be extracted using
                ``dataset.get_label()``
    """
    def inner(preds, dataset):
        """internal function"""
        labels = dataset.get_label()
        argc = argc_(func)
        if argc == 2:
            grad, hess = func(labels, preds)
        elif argc == 3:
            grad, hess = func(labels, preds, dataset.get_group())
        else:
            raise TypeError("Self-defined objective function should have 2 or 3 arguments, got %d" % argc)
        """weighted for objective"""
        weight = dataset.get_weight()
        if weight is not None:
            """only one class"""
            if len(weight) == len(grad):
                grad = np.multiply(grad, weight)
                hess = np.multiply(hess, weight)
            else:
                num_data = len(weight)
                num_class = len(grad) // num_data
                if num_class * num_data != len(grad):
                    raise ValueError("Length of grad and hess should equal to num_class * num_data")
                for k in range_(num_class):
                    for i in range_(num_data):
                        idx = k * num_data + i
                        grad[idx] *= weight[i]
                        hess[idx] *= weight[i]
        return grad, hess
    return inner
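

# Illustrative sketch, not part of the library API: a custom objective with the
# signature expected by _objective_function_wrapper. It assumes a plain regression
# setup with squared-error loss, whose gradient and hessian w.r.t. y_pred are
# (y_pred - y_true) and 1, respectively.
def _example_l2_objective(y_true, y_pred):
    """Example only: gradient and hessian of the squared-error loss."""
    grad = y_pred - y_true        # first derivative of 0.5 * (y_pred - y_true) ** 2
    hess = np.ones_like(y_pred)   # second derivative is constant
    return grad, hess
# Such a callable could be passed as ``objective=_example_l2_objective`` to an
# estimator, or wrapped directly with ``_objective_function_wrapper``.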


def _eval_function_wrapper(func):
    """Decorate an eval function
    Note: for multi-class task, the y_pred is group by class_id first, then group by row_id
          if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
    Parameters
    ----------
    func: callable
87
88
89
90
91
        Expects a callable with following functions:
            ``func(y_true, y_pred)``,
            ``func(y_true, y_pred, weight)``
         or ``func(y_true, y_pred, weight, group)``
            and return (eval_name->str, eval_result->float, is_bigger_better->Bool):
92
93
94

            y_true: array_like of shape [n_samples]
                The target values
95
            y_pred: array_like of shape [n_samples] or shape[n_samples * n_class] (for multi-class)
96
97
98
99
100
101
102
103
104
105
106
107
                The predicted values
            weight: array_like of shape [n_samples]
                The weight of samples
            group: array_like
                group/query data, used for ranking task

    Returns
    -------
    new_func: callable
        The new eval function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

108
        preds: array_like, shape [n_samples] or shape[n_samples * n_class]
109
110
111
112
113
114
115
116
            The predicted values
        dataset: ``dataset``
            The training set from which the labels will be extracted using
            ``dataset.get_label()``
    """
    def inner(preds, dataset):
        """internal function"""
        labels = dataset.get_label()
        argc = argc_(func)
        if argc == 2:
            return func(labels, preds)
        elif argc == 3:
            return func(labels, preds, dataset.get_weight())
        elif argc == 4:
            return func(labels, preds, dataset.get_weight(), dataset.get_group())
        else:
            raise TypeError("Self-defined eval function should have 2, 3 or 4 arguments, got %d" % argc)
    return inner
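

# Illustrative sketch, not part of the library API: a custom eval function with one
# of the signatures accepted by _eval_function_wrapper, returning the
# (eval_name, eval_result, is_bigger_better) triple described above. It assumes a
# regression or binary task where y_pred has the same length as y_true.
def _example_rmse_eval(y_true, y_pred):
    """Example only: root mean squared error, where lower is better."""
    rmse = np.sqrt(np.mean((y_pred - y_true) ** 2))
    return 'example_rmse', rmse, False
# Such a callable could be passed as ``eval_metric=_example_rmse_eval`` to fit().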


class LGBMModel(LGBMModelBase):

    def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 subsample_for_bin=50000, objective=None,
                 min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, random_state=0,
                 n_jobs=-1, silent=True, **kwargs):
        """
        Implementation of the Scikit-Learn API for LightGBM.

        Parameters
        ----------
        boosting_type : string
            gbdt, traditional Gradient Boosting Decision Tree.
            dart, Dropouts meet Multiple Additive Regression Trees.
        num_leaves : int
            Maximum tree leaves for base learners.
        max_depth : int
            Maximum tree depth for base learners, -1 means no limit.
        learning_rate : float
            Boosting learning rate.
        n_estimators : int
            Number of boosted trees to fit.
        max_bin : int
            Number of bucketed bins for feature values.
        subsample_for_bin : int
            Number of samples for constructing bins.
        objective : string or callable
            Specify the learning task and the corresponding learning objective or
            a custom objective function to be used (see note below).
            Default: regression for LGBMRegressor, binary for LGBMClassifier, lambdarank for LGBMRanker.
        min_split_gain : float
            Minimum loss reduction required to make a further partition on a leaf node of the tree.
        min_child_weight : int
            Minimum sum of instance weight (hessian) needed in a child (leaf).
        min_child_samples : int
            Minimum number of data needed in a child (leaf).
        subsample : float
            Subsample ratio of the training instances.
        subsample_freq : int
            Frequency of subsampling; <=0 means subsampling is disabled.
        colsample_bytree : float
            Subsample ratio of columns when constructing each tree.
        reg_alpha : float
            L1 regularization term on weights.
        reg_lambda : float
            L2 regularization term on weights.
        random_state : int
            Random number seed.
        n_jobs : int
            Number of parallel threads.
        silent : boolean
            Whether to print messages while running boosting.
        **kwargs : other parameters
            Check http://lightgbm.readthedocs.io/en/latest/Parameters.html for more parameters.
            Note: **kwargs is not supported in sklearn, so it may cause unexpected issues.

        Note
        ----
        A custom objective function can be provided for the ``objective``
        parameter. In this case, it should have the signature
        ``objective(y_true, y_pred) -> grad, hess``
            or ``objective(y_true, y_pred, group) -> grad, hess``:

            y_true: array_like of shape [n_samples]
                The target values.
            y_pred: array_like of shape [n_samples] or shape [n_samples * n_class]
                The predicted values.
            group: array_like
                group/query data, used for ranking task.
            grad: array_like of shape [n_samples] or shape [n_samples * n_class]
                The value of the gradient for each sample point.
            hess: array_like of shape [n_samples] or shape [n_samples * n_class]
                The value of the second derivative for each sample point.

        For multi-class tasks, y_pred is grouped by class_id first, then by row_id.
        To access the prediction for the i-th row of the j-th class, use
        y_pred[j * num_data + i], and grad and hess must be grouped in the same way.
        """
        if not SKLEARN_INSTALLED:
            raise LightGBMError('Scikit-learn is required for this module')

        self.boosting_type = boosting_type
        if objective is None:
            if isinstance(self, LGBMRegressor):
                self.objective = "regression"
            elif isinstance(self, LGBMClassifier):
                self.objective = "binary"
            elif isinstance(self, LGBMRanker):
                self.objective = "lambdarank"
            else:
                raise TypeError("Unknown LGBMModel type.")
        else:
            self.objective = objective
        self.num_leaves = num_leaves
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_bin = max_bin
        self.subsample_for_bin = subsample_for_bin
        self.min_split_gain = min_split_gain
        self.min_child_weight = min_child_weight
        self.min_child_samples = min_child_samples
        self.subsample = subsample
        self.subsample_freq = subsample_freq
        self.colsample_bytree = colsample_bytree
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.silent = silent
        self._Booster = None
        self.evals_result = None
        self.best_iteration = -1
        self.best_score = {}
        if callable(self.objective):
            self.fobj = _objective_function_wrapper(self.objective)
        else:
            self.fobj = None
        self.other_params = {}
        self.set_params(**kwargs)
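
        # Illustrative usage (a hedged sketch, not part of the library): a callable can
        # be passed for ``objective`` at construction time; ``my_objective`` below is a
        # hypothetical user-defined function returning (grad, hess).
        #
        #     def my_objective(y_true, y_pred):
        #         return y_pred - y_true, np.ones_like(y_pred)
        #
        #     model = LGBMRegressor(objective=my_objective, n_estimators=100)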

    def get_params(self, deep=True):
        params = super(LGBMModel, self).get_params(deep=deep)
        params.update(self.other_params)
        if 'seed' in params:
            warnings.warn('The `seed` parameter is deprecated and will be removed in next version. '
                          'Please use `random_state` instead.', LGBMDeprecationWarning)
        if 'nthread' in params:
            warnings.warn('The `nthread` parameter is deprecated and will be removed in next version. '
                          'Please use `n_jobs` instead.', LGBMDeprecationWarning)
        return params

    # minor change to support `**kwargs`
    def set_params(self, **params):
        for key, value in params.items():
            setattr(self, key, value)
            self.other_params[key] = value
        return self

    def fit(self, X, y,
            sample_weight=None, init_score=None, group=None,
            eval_set=None, eval_names=None, eval_sample_weight=None,
            eval_init_score=None, eval_group=None,
            eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            feature_name='auto', categorical_feature='auto',
            callbacks=None):
        """
        Fit the gradient boosting model.

        Parameters
        ----------
        X : array_like
            Feature matrix
        y : array_like
            Labels
        sample_weight : array_like
            Weight of training data
        init_score : array_like
            Init score of training data
        group : array_like
            Group data of training data
        eval_set : list, optional
            A list of (X, y) tuple pairs to use as a validation set for early-stopping
        eval_names : list of str
            Names of eval_set
        eval_sample_weight : list of array
            Weight of eval data
        eval_init_score : list of array
            Init score of eval data
        eval_group : list of array
            Group data of eval data
        eval_metric : str, list of str, callable, optional
            If a str, should be a built-in evaluation metric to use.
            If callable, a custom evaluation metric, see note for more details.
        early_stopping_rounds : int
            Activates early stopping. Training stops if one metric of one validation set
            does not improve in the last ``early_stopping_rounds`` rounds.
        verbose : bool
            If ``verbose`` and an evaluation set is used, the evaluation metric on the
            evaluation set is printed at each boosting stage.
        feature_name : list of str, or 'auto'
            Feature names.
            If 'auto' and data is pandas DataFrame, data columns names are used.
        categorical_feature : list of str or int, or 'auto'
            Categorical features:
            type int represents index,
            type str represents feature names (need to specify feature_name as well).
            If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
        callbacks : list of callback functions
            List of callback functions that are applied at each iteration.
            See Callbacks in Python-API.md for more information.

        Note
        ----
        A custom eval function expects a callable with one of the following signatures:
            ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)``
                or ``func(y_true, y_pred, weight, group)``
            returning (eval_name, eval_result, is_bigger_better)
                or a list of (eval_name, eval_result, is_bigger_better):

            y_true: array_like of shape [n_samples]
                The target values
            y_pred: array_like of shape [n_samples] or shape [n_samples * n_class] (for multi-class)
                The predicted values
            weight: array_like of shape [n_samples]
                The weight of samples
            group: array_like
                group/query data, used for ranking task
            eval_name: str
                Name of the evaluation metric
            eval_result: float
                Eval result
            is_bigger_better: bool
                Whether a higher eval result is better, e.g. AUC is is_bigger_better.
        For multi-class tasks, y_pred is grouped by class_id first, then by row_id.
        To access the prediction for the i-th row of the j-th class, use y_pred[j * num_data + i].
        """
        evals_result = {}
        params = self.get_params()
        # sklearn interface has another naming convention
        params.setdefault('seed', params.pop('random_state'))
        params.setdefault('nthread', params.pop('n_jobs'))
        # user can set verbose with kwargs, it has higher priority
        if 'verbose' not in params and self.silent:
            params['verbose'] = -1
        params.pop('silent', None)
        params.pop('n_estimators', None)
        if hasattr(self, 'n_classes_') and self.n_classes_ > 2:
            params['num_class'] = self.n_classes_
        if hasattr(self, 'eval_at'):
            params['ndcg_eval_at'] = self.eval_at
        if self.fobj:
            params['objective'] = 'None'  # objective = nullptr for unknown objective

        if callable(eval_metric):
            feval = _eval_function_wrapper(eval_metric)
        else:
            feval = None
            params['metric'] = eval_metric

        def _construct_dataset(X, y, sample_weight, init_score, group, params):
            ret = Dataset(X, label=y, max_bin=self.max_bin, weight=sample_weight, group=group, params=params)
            ret.set_init_score(init_score)
            return ret

        train_set = _construct_dataset(X, y, sample_weight, init_score, group, params)

        valid_sets = []
        if eval_set is not None:
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            for i, valid_data in enumerate(eval_set):
                """reduce cost for prediction training data"""
                if valid_data[0] is X and valid_data[1] is y:
                    valid_set = train_set
                else:
                    def get_meta_data(collection, i):
                        if collection is None:
                            return None
                        elif isinstance(collection, list):
                            return collection[i] if len(collection) > i else None
                        elif isinstance(collection, dict):
                            return collection.get(i, None)
                        else:
                            raise TypeError('eval_sample_weight, eval_init_score, and eval_group should be dict or list')
                    valid_weight = get_meta_data(eval_sample_weight, i)
                    valid_init_score = get_meta_data(eval_init_score, i)
                    valid_group = get_meta_data(eval_group, i)
                    valid_set = _construct_dataset(valid_data[0], valid_data[1], valid_weight, valid_init_score, valid_group, params)
                valid_sets.append(valid_set)

        self._Booster = train(params, train_set,
                              self.n_estimators, valid_sets=valid_sets, valid_names=eval_names,
                              early_stopping_rounds=early_stopping_rounds,
                              evals_result=evals_result, fobj=self.fobj, feval=feval,
                              verbose_eval=verbose, feature_name=feature_name,
                              categorical_feature=categorical_feature,
                              callbacks=callbacks)

        if evals_result:
            self.evals_result = evals_result

        if early_stopping_rounds is not None:
            self.best_iteration = self._Booster.best_iteration
        self.best_score = self._Booster.best_score

        # free dataset
        self.booster_.free_dataset()
        del train_set, valid_sets
        return self
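
        # Illustrative usage (a hedged sketch, not part of the library; the training
        # and validation variables below are assumed to exist):
        #
        #     model = LGBMRegressor(n_estimators=100, learning_rate=0.05)
        #     model.fit(X_train, y_train,
        #               eval_set=[(X_valid, y_valid)],
        #               eval_metric='l1',
        #               early_stopping_rounds=10)
        #     print(model.best_iteration, model.evals_result_)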

    def predict(self, X, raw_score=False, num_iteration=0):
        """
        Return the predicted value for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        raw_score : bool
            Whether to predict raw scores; defaults to False.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes]
        """
        return self.booster_.predict(X, raw_score=raw_score, num_iteration=num_iteration)

    def apply(self, X, num_iteration=0):
        """
        Return the predicted leaf index of every tree for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        X_leaves : array_like, shape=[n_samples, n_trees]
        """
        return self.booster_.predict(X, pred_leaf=True, num_iteration=num_iteration)
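
        # Illustrative usage (a hedged sketch; ``model`` is assumed to be a fitted
        # estimator and ``X_test`` a feature matrix):
        #
        #     y_hat = model.predict(X_test)                  # predicted values
        #     leaf_idx = model.apply(X_test)                 # leaf index of every tree
        #     raw = model.predict(X_test, raw_score=True)    # raw scores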

    @property
    def booster_(self):
        """Get the underlying lightgbm Booster of this model."""
        if self._Booster is None:
            raise LightGBMError('No booster found. Need to call fit beforehand.')
        return self._Booster

    @property
    def evals_result_(self):
        """Get the evaluation results."""
        if self.evals_result is None:
            raise LightGBMError('No results found. Need to call fit with eval set beforehand.')
        return self.evals_result

    @property
    def feature_importances_(self):
        """
        Get feature importances.

        Note: feature importance in the sklearn interface used to be normalized to sum to 1;
            this is deprecated after 2.0.4, and the values are now the same as
            Booster.feature_importance().
        """
        return self.booster_.feature_importance()

    @LGBMDeprecated('Use attribute booster_ instead.')
    def booster(self):
        return self.booster_

    @LGBMDeprecated('Use attribute feature_importances_ instead.')
    def feature_importance(self):
        return self.feature_importances_


class LGBMRegressor(LGBMModel, LGBMRegressorBase):

    def fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=None, eval_names=None, eval_sample_weight=None,
            eval_init_score=None,
            eval_metric="l2",
            early_stopping_rounds=None, verbose=True,
            feature_name='auto', categorical_feature='auto', callbacks=None):

        super(LGBMRegressor, self).fit(X, y, sample_weight=sample_weight,
                                       init_score=init_score, eval_set=eval_set,
                                       eval_names=eval_names,
                                       eval_sample_weight=eval_sample_weight,
                                       eval_init_score=eval_init_score,
                                       eval_metric=eval_metric,
                                       early_stopping_rounds=early_stopping_rounds,
                                       verbose=verbose, feature_name=feature_name,
                                       categorical_feature=categorical_feature,
                                       callbacks=callbacks)
        return self


class LGBMClassifier(LGBMModel, LGBMClassifierBase):

    def fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=None, eval_names=None, eval_sample_weight=None,
            eval_init_score=None,
            eval_metric="logloss",
            early_stopping_rounds=None, verbose=True,
            feature_name='auto', categorical_feature='auto',
            callbacks=None):
        self._le = LGBMLabelEncoder().fit(y)
        _y = self._le.transform(y)

        self.classes = self._le.classes_
        self.n_classes = len(self.classes_)
        if self.n_classes > 2:
            # Switch to using a multiclass objective in the underlying LGBM instance
            self.objective = "multiclass"
            if eval_metric == 'logloss' or eval_metric == 'binary_logloss':
                eval_metric = "multi_logloss"
            elif eval_metric == 'error' or eval_metric == 'binary_error':
                eval_metric = "multi_error"
        else:
            if eval_metric == 'logloss' or eval_metric == 'multi_logloss':
                eval_metric = 'binary_logloss'
            elif eval_metric == 'error' or eval_metric == 'multi_error':
                eval_metric = 'binary_error'

        if eval_set is not None:
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            for i, (valid_x, valid_y) in enumerate(eval_set):
                if valid_x is X and valid_y is y:
                    eval_set[i] = (valid_x, _y)
                else:
                    eval_set[i] = (valid_x, self._le.transform(valid_y))

        super(LGBMClassifier, self).fit(X, _y, sample_weight=sample_weight,
                                        init_score=init_score, eval_set=eval_set,
                                        eval_names=eval_names,
                                        eval_sample_weight=eval_sample_weight,
                                        eval_init_score=eval_init_score,
                                        eval_metric=eval_metric,
                                        early_stopping_rounds=early_stopping_rounds,
                                        verbose=verbose, feature_name=feature_name,
                                        categorical_feature=categorical_feature,
                                        callbacks=callbacks)
        return self
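
        # Illustrative usage (a hedged sketch; the data variables below are assumed):
        #
        #     clf = LGBMClassifier(n_estimators=100)
        #     clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
        #             eval_metric='logloss', early_stopping_rounds=10)
        #     labels = clf.predict(X_test)         # original class labels
        #     probs = clf.predict_proba(X_test)    # shape [n_samples, n_classes]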

    def predict(self, X, raw_score=False, num_iteration=0):
        class_probs = self.predict_proba(X, raw_score, num_iteration)
        class_index = np.argmax(class_probs, axis=1)
        return self._le.inverse_transform(class_index)

    def predict_proba(self, X, raw_score=False, num_iteration=0):
        """
        Return the predicted probability for each class for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        raw_score : bool
            Whether to predict raw scores; defaults to False.

        num_iteration : int
            Limit number of iterations in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        predicted_probability : array_like, shape=[n_samples, n_classes]
        """
        class_probs = self.booster_.predict(X, raw_score=raw_score, num_iteration=num_iteration)
        if self.n_classes > 2:
            return class_probs
        else:
            return np.vstack((1. - class_probs, class_probs)).transpose()

    @property
    def classes_(self):
        """Get class label array."""
        if self.classes is None:
            raise LightGBMError('No classes found. Need to call fit beforehand.')
        return self.classes

    @property
    def n_classes_(self):
        """Get number of classes"""
        if self.n_classes is None:
            raise LightGBMError('No classes found. Need to call fit beforehand.')
        return self.n_classes


class LGBMRanker(LGBMModel):

    def fit(self, X, y,
            sample_weight=None, init_score=None, group=None,
            eval_set=None, eval_names=None, eval_sample_weight=None,
            eval_init_score=None, eval_group=None,
            eval_metric='ndcg', eval_at=1,
            early_stopping_rounds=None, verbose=True,
            feature_name='auto', categorical_feature='auto',
            callbacks=None):
        """
        Most arguments are the same as in the common fit method above, except for the following:

        eval_at : int or list of int
            The evaluation position(s) of NDCG
        """

        """check group data"""
        if group is None:
            raise ValueError("Should set group for ranking task")

        if eval_set is not None:
            if eval_group is None:
                raise ValueError("Eval_group cannot be None when eval_set is not None")
            elif len(eval_group) != len(eval_set):
                raise ValueError("Length of eval_group should be equal to length of eval_set")
            elif (isinstance(eval_group, dict) and any(i not in eval_group or eval_group[i] is None for i in range_(len(eval_group)))) \
                    or (isinstance(eval_group, list) and any(group is None for group in eval_group)):
                raise ValueError("Should set group for all eval datasets for ranking task; if you use dict, the index should start from 0")

        if eval_at is not None:
            self.eval_at = eval_at
        super(LGBMRanker, self).fit(X, y, sample_weight=sample_weight,
                                    init_score=init_score, group=group,
                                    eval_set=eval_set, eval_names=eval_names,
                                    eval_sample_weight=eval_sample_weight,
                                    eval_init_score=eval_init_score, eval_group=eval_group,
                                    eval_metric=eval_metric,
                                    early_stopping_rounds=early_stopping_rounds,
                                    verbose=verbose, feature_name=feature_name,
                                    categorical_feature=categorical_feature,
                                    callbacks=callbacks)
        return self
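
        # Illustrative usage (a hedged sketch; ``train_group`` and ``valid_group`` give
        # the number of rows belonging to each query, e.g. [10, 8, 12] for three queries,
        # and the remaining data variables are assumed to exist):
        #
        #     ranker = LGBMRanker(n_estimators=100)
        #     ranker.fit(X_train, y_train, group=train_group,
        #                eval_set=[(X_valid, y_valid)], eval_group=[valid_group],
        #                eval_at=[1, 3, 5], early_stopping_rounds=10)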