# coding: utf-8
"""Scikit-learn wrapper interface for LightGBM."""
wxchan's avatar
wxchan committed
3
from __future__ import absolute_import
4

5
6
import warnings

wxchan's avatar
wxchan committed
7
import numpy as np
8

9
from .basic import Dataset, LightGBMError, _ConfigAliases
10
from .compat import (SKLEARN_INSTALLED, _LGBMClassifierBase,
11
                     LGBMNotFittedError, _LGBMLabelEncoder, _LGBMModelBase,
12
                     _LGBMRegressorBase, _LGBMCheckXY, _LGBMCheckArray, _LGBMCheckSampleWeight,
13
                     _LGBMAssertAllFinite, _LGBMCheckClassificationTargets, _LGBMComputeSampleWeight,
14
                     argc_, range_, zip_, string_type, DataFrame, DataTable)
wxchan's avatar
wxchan committed
15
from .engine import train
16

wxchan's avatar
wxchan committed
17

18
19
class _ObjectiveFunctionWrapper(object):
    """Proxy class for objective function."""

    def __init__(self, func):
        """Construct a proxy class.

        This class transforms objective function to match objective function with signature ``new_func(preds, dataset)``
        as expected by ``lightgbm.engine.train``.

        Parameters
        ----------
        func : callable
            Expects a callable with signature ``func(y_true, y_pred)`` or ``func(y_true, y_pred, group)``
            and returns (grad, hess):

                y_true : array-like of shape = [n_samples]
                    The target values.
                y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
                    The predicted values.
                group : array-like
                    Group/query data, used for ranking task.
                grad : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
                    The value of the first order derivative (gradient) for each sample point.
                hess : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
                    The value of the second order derivative (Hessian) for each sample point.

        .. note::

            For binary task, the y_pred is margin.
            For multi-class task, the y_pred is grouped by class_id first, then grouped by row_id.
            If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i]
            and you should group grad and hess in this way as well.
        """
        self.func = func

    def __call__(self, preds, dataset):
        """Call passed function with appropriate arguments.

        The wrapped function is called with 2 or 3 positional arguments depending on
        how many arguments it declares. If the dataset carries sample weights,
        the returned gradient and Hessian are multiplied by them.

        Parameters
        ----------
        preds : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
            The predicted values.
        dataset : Dataset
            The training dataset.

        Returns
        -------
        grad : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
            The value of the first order derivative (gradient) for each sample point.
        hess : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
            The value of the second order derivative (Hessian) for each sample point.

        Raises
        ------
        TypeError
            If the wrapped function does not declare exactly 2 or 3 arguments.
        ValueError
            If the dataset has weights and the length of ``grad`` is not a multiple of the number of weights.
        """
        labels = dataset.get_label()
        argc = argc_(self.func)
        if argc == 2:
            grad, hess = self.func(labels, preds)
        elif argc == 3:
            grad, hess = self.func(labels, preds, dataset.get_group())
        else:
            raise TypeError("Self-defined objective function should have 2 or 3 arguments, got %d" % argc)

        # Apply sample weights to the objective's derivatives.
        weight = dataset.get_weight()
        if weight is not None:
            if len(weight) == len(grad):
                # Only one class: one weight per gradient entry.
                grad = np.multiply(grad, weight)
                hess = np.multiply(hess, weight)
            else:
                num_data = len(weight)
                num_class = len(grad) // num_data
                if num_class * num_data != len(grad):
                    raise ValueError("Length of grad and hess should equal to num_class * num_data")
                # Entries are laid out class by class (index = k * num_data + i),
                # so repeating the per-row weights once per class lines them up.
                # Multiplying out-of-place avoids mutating the arrays returned by
                # the user's function and avoids truncation for integer arrays.
                tiled_weight = np.tile(weight, num_class)
                grad = np.multiply(grad, tiled_weight)
                hess = np.multiply(hess, tiled_weight)
        return grad, hess

wxchan's avatar
wxchan committed
97

98
99
class _EvalFunctionWrapper(object):
    """Proxy class for evaluation function."""

    def __init__(self, func):
        """Construct a proxy class.

        This class transforms evaluation function to match evaluation function with signature ``new_func(preds, dataset)``
        as expected by ``lightgbm.engine.train``.

        Parameters
        ----------
        func : callable
            Expects a callable with following signatures:
            ``func(y_true, y_pred)``,
            ``func(y_true, y_pred, weight)``
            or ``func(y_true, y_pred, weight, group)``
            and returns (eval_name, eval_result, is_higher_better) or
            list of (eval_name, eval_result, is_higher_better):

                y_true : array-like of shape = [n_samples]
                    The target values.
                y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
                    The predicted values.
                weight : array-like of shape = [n_samples]
                    The weight of samples.
                group : array-like
                    Group/query data, used for ranking task.
                eval_name : string
                    The name of evaluation function (without whitespaces).
                eval_result : float
                    The eval result.
                is_higher_better : bool
                    Is eval result higher better, e.g. AUC is ``is_higher_better``.

        .. note::

            For binary task, the y_pred is probability of positive class (or margin in case of custom ``objective``).
            For multi-class task, the y_pred is grouped by class_id first, then grouped by row_id.
            If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i].
        """
        self.func = func

    def __call__(self, preds, dataset):
        """Call passed function with appropriate arguments.

        The wrapped function is called with 2, 3 or 4 positional arguments
        depending on how many arguments it declares.

        Parameters
        ----------
        preds : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
            The predicted values.
        dataset : Dataset
            Dataset supplying the labels (and, depending on the signature of ``func``,
            the weights and group data) passed to the wrapped function together with ``preds``.

        Returns
        -------
        eval_name : string
            The name of evaluation function (without whitespaces).
        eval_result : float
            The eval result.
        is_higher_better : bool
            Is eval result higher better, e.g. AUC is ``is_higher_better``.

        Raises
        ------
        TypeError
            If the wrapped function does not declare 2, 3 or 4 arguments.
        """
        labels = dataset.get_label()
        argc = argc_(self.func)
        if argc == 2:
            return self.func(labels, preds)
        elif argc == 3:
            return self.func(labels, preds, dataset.get_weight())
        elif argc == 4:
            return self.func(labels, preds, dataset.get_weight(), dataset.get_group())
        else:
            raise TypeError("Self-defined eval function should have 2, 3 or 4 arguments, got %d" % argc)
169

wxchan's avatar
wxchan committed
170

171
172
class LGBMModel(_LGBMModelBase):
    """Implementation of the scikit-learn API for LightGBM."""
wxchan's avatar
wxchan committed
173

174
    def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=100,
                 subsample_for_bin=200000, objective=None, class_weight=None,
                 min_split_gain=0., min_child_weight=1e-3, min_child_samples=20,
                 subsample=1., subsample_freq=0, colsample_bytree=1.,
                 reg_alpha=0., reg_lambda=0., random_state=None,
                 n_jobs=-1, silent=True, importance_type='split', **kwargs):
        r"""Construct a gradient boosting model.

        Parameters
        ----------
        boosting_type : string, optional (default='gbdt')
            'gbdt', traditional Gradient Boosting Decision Tree.
            'dart', Dropouts meet Multiple Additive Regression Trees.
            'goss', Gradient-based One-Side Sampling.
            'rf', Random Forest.
        num_leaves : int, optional (default=31)
            Maximum tree leaves for base learners.
        max_depth : int, optional (default=-1)
            Maximum tree depth for base learners, <=0 means no limit.
        learning_rate : float, optional (default=0.1)
            Boosting learning rate.
            You can use ``callbacks`` parameter of ``fit`` method to shrink/adapt learning rate
            in training using ``reset_parameter`` callback.
            Note, that this will ignore the ``learning_rate`` argument in training.
        n_estimators : int, optional (default=100)
            Number of boosted trees to fit.
        subsample_for_bin : int, optional (default=200000)
            Number of samples for constructing bins.
        objective : string, callable or None, optional (default=None)
            Specify the learning task and the corresponding learning objective or
            a custom objective function to be used (see note below).
            Default: 'regression' for LGBMRegressor, 'binary' or 'multiclass' for LGBMClassifier, 'lambdarank' for LGBMRanker.
        class_weight : dict, 'balanced' or None, optional (default=None)
            Weights associated with classes in the form ``{class_label: weight}``.
            Use this parameter only for multi-class classification task;
            for binary classification task you may use ``is_unbalance`` or ``scale_pos_weight`` parameters.
            Note, that the usage of all these parameters will result in poor estimates of the individual class probabilities.
            You may want to consider performing probability calibration
            (https://scikit-learn.org/stable/modules/calibration.html) of your model.
            The 'balanced' mode uses the values of y to automatically adjust weights
            inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.
            If None, all classes are supposed to have weight one.
            Note, that these weights will be multiplied with ``sample_weight`` (passed through the ``fit`` method)
            if ``sample_weight`` is specified.
        min_split_gain : float, optional (default=0.)
            Minimum loss reduction required to make a further partition on a leaf node of the tree.
        min_child_weight : float, optional (default=1e-3)
            Minimum sum of instance weight (hessian) needed in a child (leaf).
        min_child_samples : int, optional (default=20)
            Minimum number of data needed in a child (leaf).
        subsample : float, optional (default=1.)
            Subsample ratio of the training instance.
        subsample_freq : int, optional (default=0)
            Frequency of subsample, <=0 means not enabled.
        colsample_bytree : float, optional (default=1.)
            Subsample ratio of columns when constructing each tree.
        reg_alpha : float, optional (default=0.)
            L1 regularization term on weights.
        reg_lambda : float, optional (default=0.)
            L2 regularization term on weights.
        random_state : int, RandomState object or None, optional (default=None)
            Random number seed.
            If int, this number is used to seed the C++ code.
            If RandomState object (numpy), a random integer is picked based on its state to seed the C++ code.
            If None, default seeds in C++ code are used.
        n_jobs : int, optional (default=-1)
            Number of parallel threads.
        silent : bool, optional (default=True)
            Whether to print messages while running boosting.
        importance_type : string, optional (default='split')
            The type of feature importance to be filled into ``feature_importances_``.
            If 'split', result contains numbers of times the feature is used in a model.
            If 'gain', result contains total gains of splits which use the feature.
        **kwargs
            Other parameters for the model.
            Check https://lightgbm.readthedocs.io/en/latest/Parameters.html for more parameters.

            .. warning::

                \*\*kwargs is not supported in sklearn, it may cause unexpected issues.

        Raises
        ------
        LightGBMError
            If scikit-learn is not installed.

        Attributes
        ----------
        n_features_in_ : int
            The number of features of fitted model.

        Note
        ----
        A custom objective function can be provided for the ``objective`` parameter.
        In this case, it should have the signature
        ``objective(y_true, y_pred) -> grad, hess`` or
        ``objective(y_true, y_pred, group) -> grad, hess``:

            y_true : array-like of shape = [n_samples]
                The target values.
            y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
                The predicted values.
            group : array-like
                Group/query data, used for ranking task.
            grad : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
                The value of the first order derivative (gradient) for each sample point.
            hess : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
                The value of the second order derivative (Hessian) for each sample point.

        For binary task, the y_pred is margin.
        For multi-class task, the y_pred is grouped by class_id first, then grouped by row_id.
        If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i]
        and you should group grad and hess in this way as well.
        """
        if not SKLEARN_INSTALLED:
            raise LightGBMError('Scikit-learn is required for this module')

        # Public hyper-parameters, stored unmodified under the constructor argument names.
        self.boosting_type = boosting_type
        self.objective = objective
        self.num_leaves = num_leaves
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.subsample_for_bin = subsample_for_bin
        self.min_split_gain = min_split_gain
        self.min_child_weight = min_child_weight
        self.min_child_samples = min_child_samples
        self.subsample = subsample
        self.subsample_freq = subsample_freq
        self.colsample_bytree = colsample_bytree
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.silent = silent
        self.importance_type = importance_type

        # Private state; everything except ``_objective`` and ``_other_params`` starts as None.
        self._Booster = None
        self._evals_result = None
        self._best_score = None
        self._best_iteration = None
        # Must exist before ``set_params`` is called below: it records every
        # parameter it receives in this dict, and ``get_params`` merges it back.
        self._other_params = {}
        # Private mirrors of ``objective`` / ``class_weight``; ``fit`` reads and
        # may overwrite these, leaving the public attributes as the user set them.
        self._objective = objective
        self.class_weight = class_weight
        self._class_weight = None
        self._class_map = None
        self._n_features = None
        self._classes = None
        self._n_classes = None

        # Extra keyword arguments become attributes and are tracked in ``_other_params``.
        self.set_params(**kwargs)
wxchan's avatar
wxchan committed
319

Nikita Titov's avatar
Nikita Titov committed
320
321
322
323
    def _more_tags(self):
        return {'allow_nan': True,
                'X_types': ['2darray', 'sparse', '1dlabels']}

wxchan's avatar
wxchan committed
324
    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Parameters
        ----------
        deep : bool, optional (default=True)
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : dict
            Parameter names mapped to their values.
        """
        params = super(LGBMModel, self).get_params(deep=deep)
        # Add the parameters recorded by ``set_params`` (including constructor
        # ``**kwargs``); on a name clash the recorded value wins.
        params.update(self._other_params)
        return params

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Every given parameter is stored as an attribute of the same name and
        recorded in ``self._other_params``. If an attribute named ``'_' + key``
        already exists on the instance, it is updated to the same value.

        Parameters
        ----------
        **params
            Parameter names with their new values.

        Returns
        -------
        self : object
            Returns self.
        """
        for key, value in params.items():
            setattr(self, key, value)
            # Keep an existing private mirror (e.g. ``_objective``) in sync
            # with the public attribute.
            if hasattr(self, '_' + key):
                setattr(self, '_' + key, value)
            self._other_params[key] = value
        return self
wxchan's avatar
wxchan committed
361

Guolin Ke's avatar
Guolin Ke committed
362
    def fit(self, X, y,
363
            sample_weight=None, init_score=None, group=None,
364
            eval_set=None, eval_names=None, eval_sample_weight=None,
365
366
            eval_class_weight=None, eval_init_score=None, eval_group=None,
            eval_metric=None, early_stopping_rounds=None, verbose=True,
367
368
            feature_name='auto', categorical_feature='auto',
            callbacks=None, init_model=None):
369
        """Build a gradient boosting model from the training set (X, y).
wxchan's avatar
wxchan committed
370
371
372

        Parameters
        ----------
373
374
375
376
377
378
379
380
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input feature matrix.
        y : array-like of shape = [n_samples]
            The target values (class labels in classification, real numbers in regression).
        sample_weight : array-like of shape = [n_samples] or None, optional (default=None)
            Weights of training data.
        init_score : array-like of shape = [n_samples] or None, optional (default=None)
            Init score of training data.
381
        group : array-like or None, optional (default=None)
382
383
            Group data of training data.
        eval_set : list or None, optional (default=None)
384
            A list of (X, y) tuple pairs to use as validation sets.
385
        eval_names : list of strings or None, optional (default=None)
386
387
388
            Names of eval_set.
        eval_sample_weight : list of arrays or None, optional (default=None)
            Weights of eval data.
389
390
        eval_class_weight : list or None, optional (default=None)
            Class weights of eval data.
391
392
393
394
395
396
        eval_init_score : list of arrays or None, optional (default=None)
            Init score of eval data.
        eval_group : list of arrays or None, optional (default=None)
            Group data of eval data.
        eval_metric : string, list of strings, callable or None, optional (default=None)
            If string, it should be a built-in evaluation metric to use.
397
            If callable, it should be a custom evaluation metric, see note below for more details.
Misha Lisovyi's avatar
Misha Lisovyi committed
398
            In either case, the ``metric`` from the model parameters will be evaluated and used as well.
399
            Default: 'l2' for LGBMRegressor, 'logloss' for LGBMClassifier, 'ndcg' for LGBMRanker.
400
401
        early_stopping_rounds : int or None, optional (default=None)
            Activates early stopping. The model will train until the validation score stops improving.
402
            Validation score needs to improve at least every ``early_stopping_rounds`` round(s)
403
            to continue training.
404
405
            Requires at least one validation data and one metric.
            If there's more than one, will check all of them. But the training data is ignored anyway.
406
407
            To check only the first metric, set the ``first_metric_only`` parameter to ``True``
            in additional parameters ``**kwargs`` of the model constructor.
408
409
410
411
412
413
        verbose : bool or int, optional (default=True)
            Requires at least one evaluation data.
            If True, the eval metric on the eval set is printed at each boosting stage.
            If int, the eval metric on the eval set is printed at every ``verbose`` boosting stage.
            The last boosting stage or the boosting stage found by using ``early_stopping_rounds`` is also printed.

Nikita Titov's avatar
Nikita Titov committed
414
415
            .. rubric:: Example

416
417
418
            With ``verbose`` = 4 and at least one item in ``eval_set``,
            an evaluation metric is printed every 4 (instead of 1) boosting stages.

419
        feature_name : list of strings or 'auto', optional (default='auto')
420
421
            Feature names.
            If 'auto' and data is pandas DataFrame, data columns names are used.
422
        categorical_feature : list of strings or int, or 'auto', optional (default='auto')
423
424
            Categorical features.
            If list of int, interpreted as indices.
425
            If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
426
            If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
427
            All values in categorical features should be less than int32 max value (2147483647).
428
            Large values could be memory consuming. Consider using consecutive integers starting from zero.
429
            All negative values in categorical features will be treated as missing values.
430
            The output cannot be monotonically constrained with respect to a categorical feature.
431
        callbacks : list of callback functions or None, optional (default=None)
432
            List of callback functions that are applied at each iteration.
433
            See Callbacks in Python API for more information.
434
435
        init_model : string, Booster, LGBMModel or None, optional (default=None)
            Filename of LightGBM model, Booster instance or LGBMModel instance used for continue training.
436

437
438
439
440
441
        Returns
        -------
        self : object
            Returns self.

442
443
        Note
        ----
444
        Custom eval function expects a callable with following signatures:
445
        ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)`` or
446
        ``func(y_true, y_pred, weight, group)``
447
448
        and returns (eval_name, eval_result, is_higher_better) or
        list of (eval_name, eval_result, is_higher_better):
449

Nikita Titov's avatar
Nikita Titov committed
450
            y_true : array-like of shape = [n_samples]
451
                The target values.
452
            y_pred : array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task)
453
                The predicted values.
Nikita Titov's avatar
Nikita Titov committed
454
            weight : array-like of shape = [n_samples]
455
                The weight of samples.
Nikita Titov's avatar
Nikita Titov committed
456
            group : array-like
457
                Group/query data, used for ranking task.
Nikita Titov's avatar
Nikita Titov committed
458
            eval_name : string
459
                The name of evaluation function (without whitespaces).
Nikita Titov's avatar
Nikita Titov committed
460
            eval_result : float
461
                The eval result.
462
463
            is_higher_better : bool
                Is eval result higher better, e.g. AUC is ``is_higher_better``.
464

465
        For binary task, the y_pred is probability of positive class (or margin in case of custom ``objective``).
466
467
        For multi-class task, the y_pred is group by class_id first, then group by row_id.
        If you want to get i-th row y_pred in j-th class, the access way is y_pred[j * num_data + i].
wxchan's avatar
wxchan committed
468
        """
469
470
471
472
473
474
475
476
477
478
        if self._objective is None:
            if isinstance(self, LGBMRegressor):
                self._objective = "regression"
            elif isinstance(self, LGBMClassifier):
                self._objective = "binary"
            elif isinstance(self, LGBMRanker):
                self._objective = "lambdarank"
            else:
                raise ValueError("Unknown LGBMModel type.")
        if callable(self._objective):
479
            self._fobj = _ObjectiveFunctionWrapper(self._objective)
480
481
        else:
            self._fobj = None
wxchan's avatar
wxchan committed
482
483
        evals_result = {}
        params = self.get_params()
wxchan's avatar
wxchan committed
484
        # user can set verbose with kwargs, it has higher priority
485
        if not any(verbose_alias in params for verbose_alias in _ConfigAliases.get("verbosity")) and self.silent:
486
            params['verbose'] = -1
wxchan's avatar
wxchan committed
487
        params.pop('silent', None)
488
        params.pop('importance_type', None)
wxchan's avatar
wxchan committed
489
        params.pop('n_estimators', None)
490
        params.pop('class_weight', None)
491
492
        if isinstance(params['random_state'], np.random.RandomState):
            params['random_state'] = params['random_state'].randint(np.iinfo(np.int32).max)
493
494
        for alias in _ConfigAliases.get('objective'):
            params.pop(alias, None)
495
        if self._n_classes is not None and self._n_classes > 2:
496
497
            for alias in _ConfigAliases.get('num_class'):
                params.pop(alias, None)
498
499
            params['num_class'] = self._n_classes
        if hasattr(self, '_eval_at'):
500
501
            for alias in _ConfigAliases.get('eval_at'):
                params.pop(alias, None)
502
            params['eval_at'] = self._eval_at
503
504
        params['objective'] = self._objective
        if self._fobj:
wxchan's avatar
wxchan committed
505
            params['objective'] = 'None'  # objective = nullptr for unknown objective
wxchan's avatar
wxchan committed
506
507

        if callable(eval_metric):
508
            feval = _EvalFunctionWrapper(eval_metric)
wxchan's avatar
wxchan committed
509
510
        else:
            feval = None
511
512
513
514
515
516
517
518
519
520
521
            # register default metric for consistency with callable eval_metric case
            original_metric = self._objective if isinstance(self._objective, string_type) else None
            if original_metric is None:
                # try to deduce from class instance
                if isinstance(self, LGBMRegressor):
                    original_metric = "l2"
                elif isinstance(self, LGBMClassifier):
                    original_metric = "multi_logloss" if self._n_classes > 2 else "binary_logloss"
                elif isinstance(self, LGBMRanker):
                    original_metric = "ndcg"
            # overwrite default metric by explicitly set metric
522
            for metric_alias in _ConfigAliases.get("metric"):
523
524
525
526
527
                if metric_alias in params:
                    original_metric = params.pop(metric_alias)
            # concatenate metric from params (or default if not provided in params) and eval_metric
            original_metric = [original_metric] if isinstance(original_metric, (string_type, type(None))) else original_metric
            eval_metric = [eval_metric] if isinstance(eval_metric, (string_type, type(None))) else eval_metric
528
529
            params['metric'] = [e for e in eval_metric if e not in original_metric] + original_metric
            params['metric'] = [metric for metric in params['metric'] if metric is not None]
wxchan's avatar
wxchan committed
530

531
        if not isinstance(X, (DataFrame, DataTable)):
532
            _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2)
533
534
            if sample_weight is not None:
                sample_weight = _LGBMCheckSampleWeight(sample_weight, _X)
535
536
        else:
            _X, _y = X, y
537

538
539
540
541
        if self._class_weight is None:
            self._class_weight = self.class_weight
        if self._class_weight is not None:
            class_sample_weight = _LGBMComputeSampleWeight(self._class_weight, y)
542
543
544
545
            if sample_weight is None or len(sample_weight) == 0:
                sample_weight = class_sample_weight
            else:
                sample_weight = np.multiply(sample_weight, class_sample_weight)
546

547
        self._n_features = _X.shape[1]
548
549
        # set public attribute for consistency
        self.n_features_in_ = self._n_features
550

551
552
        def _construct_dataset(X, y, sample_weight, init_score, group, params,
                               categorical_feature='auto'):
553
            return Dataset(X, label=y, weight=sample_weight, group=group,
554
555
                           init_score=init_score, params=params,
                           categorical_feature=categorical_feature)
Guolin Ke's avatar
Guolin Ke committed
556

557
558
        train_set = _construct_dataset(_X, _y, sample_weight, init_score, group, params,
                                       categorical_feature=categorical_feature)
Guolin Ke's avatar
Guolin Ke committed
559
560
561

        valid_sets = []
        if eval_set is not None:
562

563
            def _get_meta_data(collection, name, i):
564
565
566
567
568
569
570
                if collection is None:
                    return None
                elif isinstance(collection, list):
                    return collection[i] if len(collection) > i else None
                elif isinstance(collection, dict):
                    return collection.get(i, None)
                else:
571
                    raise TypeError('{} should be dict or list'.format(name))
572

Guolin Ke's avatar
Guolin Ke committed
573
574
575
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            for i, valid_data in enumerate(eval_set):
576
                # reduce cost for prediction training data
Guolin Ke's avatar
Guolin Ke committed
577
578
579
                if valid_data[0] is X and valid_data[1] is y:
                    valid_set = train_set
                else:
580
581
582
583
584
585
                    valid_weight = _get_meta_data(eval_sample_weight, 'eval_sample_weight', i)
                    valid_class_weight = _get_meta_data(eval_class_weight, 'eval_class_weight', i)
                    if valid_class_weight is not None:
                        if isinstance(valid_class_weight, dict) and self._class_map is not None:
                            valid_class_weight = {self._class_map[k]: v for k, v in valid_class_weight.items()}
                        valid_class_sample_weight = _LGBMComputeSampleWeight(valid_class_weight, valid_data[1])
586
587
588
589
                        if valid_weight is None or len(valid_weight) == 0:
                            valid_weight = valid_class_sample_weight
                        else:
                            valid_weight = np.multiply(valid_weight, valid_class_sample_weight)
590
591
                    valid_init_score = _get_meta_data(eval_init_score, 'eval_init_score', i)
                    valid_group = _get_meta_data(eval_group, 'eval_group', i)
592
593
                    valid_set = _construct_dataset(valid_data[0], valid_data[1],
                                                   valid_weight, valid_init_score, valid_group, params)
Guolin Ke's avatar
Guolin Ke committed
594
595
                valid_sets.append(valid_set)

596
597
598
        if isinstance(init_model, LGBMModel):
            init_model = init_model.booster_

Guolin Ke's avatar
Guolin Ke committed
599
        self._Booster = train(params, train_set,
600
                              self.n_estimators, valid_sets=valid_sets, valid_names=eval_names,
wxchan's avatar
wxchan committed
601
                              early_stopping_rounds=early_stopping_rounds,
602
                              evals_result=evals_result, fobj=self._fobj, feval=feval,
Guolin Ke's avatar
Guolin Ke committed
603
                              verbose_eval=verbose, feature_name=feature_name,
604
                              callbacks=callbacks, init_model=init_model)
wxchan's avatar
wxchan committed
605
606

        if evals_result:
607
            self._evals_result = evals_result
wxchan's avatar
wxchan committed
608
609

        if early_stopping_rounds is not None:
610
            self._best_iteration = self._Booster.best_iteration
611
612

        self._best_score = self._Booster.best_score
wxchan's avatar
wxchan committed
613
614

        # free dataset
615
        self._Booster.free_dataset()
wxchan's avatar
wxchan committed
616
        del train_set, valid_sets
wxchan's avatar
wxchan committed
617
618
        return self

619
    def predict(self, X, raw_score=False, num_iteration=None,
                pred_leaf=False, pred_contrib=False, **kwargs):
        """Compute the prediction for every sample of ``X``.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input features matrix.
        raw_score : bool, optional (default=False)
            Whether to predict raw scores.
        num_iteration : int or None, optional (default=None)
            Limit number of iterations in the prediction.
            If None, if the best iteration exists, it is used; otherwise, all trees are used.
            If <= 0, all trees are used (no limits).
        pred_leaf : bool, optional (default=False)
            Whether to predict leaf index.
        pred_contrib : bool, optional (default=False)
            Whether to predict feature contributions.

            .. note::

                If you want to get more explanations for your model's predictions using SHAP values,
                like SHAP interaction values,
                you can install the shap package (https://github.com/slundberg/shap).
                Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra
                column, where the last column is the expected value.

        **kwargs
            Other parameters for the prediction.

        Returns
        -------
        predicted_result : array-like of shape = [n_samples] or shape = [n_samples, n_classes]
            The predicted values.
        X_leaves : array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]
            If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
        X_SHAP_values : array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes]
            If ``pred_contrib=True``, the feature contributions for each sample.

        Raises
        ------
        LGBMNotFittedError
            If the number of features seen during ``fit`` has not been recorded yet.
        ValueError
            If the number of columns of ``X`` differs from the number of features recorded by ``fit``.
        """
        fitted_width = self._n_features
        if fitted_width is None:
            raise LGBMNotFittedError("Estimator not fitted, call `fit` before exploiting the model.")

        # DataFrame / DataTable inputs are forwarded untouched; everything else is validated first
        if not isinstance(X, (DataFrame, DataTable)):
            X = _LGBMCheckArray(X, accept_sparse=True, force_all_finite=False)

        input_width = X.shape[1]
        if fitted_width != input_width:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features_ is %s and "
                             "input n_features is %s "
                             % (fitted_width, input_width))

        return self._Booster.predict(X, raw_score=raw_score, num_iteration=num_iteration,
                                     pred_leaf=pred_leaf, pred_contrib=pred_contrib, **kwargs)
wxchan's avatar
wxchan committed
670

671
672
    @property
    def n_features_(self):
        """:obj:`int`: The number of features of fitted model.

        Raises ``LGBMNotFittedError`` while no feature count has been recorded.
        """
        if self._n_features is not None:
            return self._n_features
        raise LGBMNotFittedError('No n_features found. Need to call fit beforehand.')

    @property
    def best_score_(self):
        """:obj:`dict` or :obj:`None`: The best score of fitted model.

        Raises ``LGBMNotFittedError`` while no feature count has been recorded.
        """
        if self._n_features is not None:
            return self._best_score
        raise LGBMNotFittedError('No best_score found. Need to call fit beforehand.')

    @property
    def best_iteration_(self):
        """:obj:`int` or :obj:`None`: The best iteration of fitted model if ``early_stopping_rounds`` has been specified.

        Raises ``LGBMNotFittedError`` while no feature count has been recorded.
        """
        if self._n_features is not None:
            return self._best_iteration
        raise LGBMNotFittedError('No best_iteration found. Need to call fit with early_stopping_rounds beforehand.')

    @property
    def objective_(self):
        """:obj:`string` or :obj:`callable`: The concrete objective used while fitting this model.

        Raises ``LGBMNotFittedError`` while no feature count has been recorded.
        """
        if self._n_features is not None:
            return self._objective
        raise LGBMNotFittedError('No objective found. Need to call fit beforehand.')

699
700
    @property
    def booster_(self):
        """Booster: The underlying Booster of this model.

        Raises ``LGBMNotFittedError`` while no Booster is attached to the estimator.
        """
        if self._Booster is not None:
            return self._Booster
        raise LGBMNotFittedError('No booster found. Need to call fit beforehand.')
wxchan's avatar
wxchan committed
705

706
707
    @property
    def evals_result_(self):
        """:obj:`dict` or :obj:`None`: The evaluation results recorded during ``fit`` if ``eval_set`` has been specified.

        Raises ``LGBMNotFittedError`` while no feature count has been recorded.
        """
        if self._n_features is not None:
            return self._evals_result
        raise LGBMNotFittedError('No results found. Need to call fit with eval_set beforehand.')
712
713

    @property
    def feature_importances_(self):
        """:obj:`array` of shape = [n_features]: The feature importances (the higher, the more important).

        .. note::

            ``importance_type`` attribute is passed to the function
            to configure the type of importance values to be extracted.
        """
        if self._n_features is not None:
            # the kind of importance is taken from the estimator's ``importance_type`` attribute
            return self._Booster.feature_importance(importance_type=self.importance_type)
        raise LGBMNotFittedError('No feature_importances found. Need to call fit beforehand.')
wxchan's avatar
wxchan committed
725

726
727
    @property
    def feature_name_(self):
        """:obj:`array` of shape = [n_features]: The names of features.

        Raises ``LGBMNotFittedError`` while no feature count has been recorded.
        """
        if self._n_features is not None:
            return self._Booster.feature_name()
        raise LGBMNotFittedError('No feature_name found. Need to call fit beforehand.')

wxchan's avatar
wxchan committed
733

734
735
class LGBMRegressor(LGBMModel, _LGBMRegressorBase):
    """LightGBM regressor."""

    def fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=None, eval_names=None, eval_sample_weight=None,
            eval_init_score=None, eval_metric=None, early_stopping_rounds=None,
            verbose=True, feature_name='auto', categorical_feature='auto',
            callbacks=None, init_model=None):
        """Docstring is inherited from the LGBMModel."""
        # every option is handed over to LGBMModel.fit by keyword, unchanged
        forwarded_options = dict(
            sample_weight=sample_weight,
            init_score=init_score,
            eval_set=eval_set,
            eval_names=eval_names,
            eval_sample_weight=eval_sample_weight,
            eval_init_score=eval_init_score,
            eval_metric=eval_metric,
            early_stopping_rounds=early_stopping_rounds,
            verbose=verbose,
            feature_name=feature_name,
            categorical_feature=categorical_feature,
            callbacks=callbacks,
            init_model=init_model,
        )
        super(LGBMRegressor, self).fit(X, y, **forwarded_options)
        return self

    # The documentation of fit is the one of LGBMModel.fit with the text running from
    # 'eval_class_weight :' up to (but excluding) 'eval_init_score :' cut out,
    # since this signature has no ``eval_class_weight`` argument.
    _base_doc = LGBMModel.fit.__doc__
    fit.__doc__ = ''.join((
        _base_doc[:_base_doc.find('eval_class_weight :')],
        _base_doc[_base_doc.find('eval_init_score :'):],
    ))
wxchan's avatar
wxchan committed
759

760
761
762

class LGBMClassifier(LGBMModel, _LGBMClassifierBase):
    """LightGBM classifier."""

    def fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=None, eval_names=None, eval_sample_weight=None,
            eval_class_weight=None, eval_init_score=None, eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            feature_name='auto', categorical_feature='auto',
            callbacks=None, init_model=None):
        """Docstring is inherited from the LGBMModel."""
        _LGBMAssertAllFinite(y)
        _LGBMCheckClassificationTargets(y)
        # labels are encoded before being handed to LGBMModel.fit
        self._le = _LGBMLabelEncoder().fit(y)
        _y = self._le.transform(y)
        # original label -> encoded label, used to translate user-supplied class weights
        self._class_map = dict(zip_(self._le.classes_, self._le.transform(self._le.classes_)))
        if isinstance(self.class_weight, dict):
            self._class_weight = {self._class_map[k]: v for k, v in self.class_weight.items()}

        self._classes = self._le.classes_
        self._n_classes = len(self._classes)
        if self._n_classes > 2:
            # Switch to using a multiclass objective in the underlying LGBM instance
            ova_aliases = {"multiclassova", "multiclass_ova", "ova", "ovr"}
            if self._objective not in ova_aliases and not callable(self._objective):
                self._objective = "multiclass"
            metric_renames = {'logloss': 'multi_logloss', 'binary_logloss': 'multi_logloss',
                              'error': 'multi_error', 'binary_error': 'multi_error'}
        else:
            metric_renames = {'logloss': 'binary_logloss', 'multi_logloss': 'binary_logloss',
                              'error': 'binary_error', 'multi_error': 'binary_error'}

        def _rename_metric(metric):
            # only metric names are translated; None and callables pass through untouched
            if isinstance(metric, string_type):
                return metric_renames.get(metric, metric)
            return metric

        if isinstance(eval_metric, (list, tuple, set)):
            # A collection of metrics is translated element by element; testing the collection
            # itself for set membership would raise TypeError for unhashable lists.
            # A new list is built so that the caller's argument is never modified.
            eval_metric = [_rename_metric(metric) for metric in eval_metric]
        else:
            eval_metric = _rename_metric(eval_metric)

        # do not modify args, as it causes errors in model selection tools
        valid_sets = None
        if eval_set is not None:
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            valid_sets = [None] * len(eval_set)
            for i, (valid_x, valid_y) in enumerate(eval_set):
                if valid_x is X and valid_y is y:
                    # the training data itself: reuse the already encoded labels
                    valid_sets[i] = (valid_x, _y)
                else:
                    valid_sets[i] = (valid_x, self._le.transform(valid_y))

        super(LGBMClassifier, self).fit(X, _y, sample_weight=sample_weight,
                                        init_score=init_score, eval_set=valid_sets,
                                        eval_names=eval_names,
                                        eval_sample_weight=eval_sample_weight,
                                        eval_class_weight=eval_class_weight,
                                        eval_init_score=eval_init_score,
                                        eval_metric=eval_metric,
                                        early_stopping_rounds=early_stopping_rounds,
                                        verbose=verbose, feature_name=feature_name,
                                        categorical_feature=categorical_feature,
                                        callbacks=callbacks, init_model=init_model)
        return self

    fit.__doc__ = LGBMModel.fit.__doc__

    def predict(self, X, raw_score=False, num_iteration=None,
                pred_leaf=False, pred_contrib=False, **kwargs):
        """Docstring is inherited from the LGBMModel."""
        result = self.predict_proba(X, raw_score, num_iteration,
                                    pred_leaf, pred_contrib, **kwargs)
        if callable(self._objective) or raw_score or pred_leaf or pred_contrib:
            # in these modes the output of predict_proba is returned as is
            return result
        else:
            # pick the most probable class and map it back to the original label
            class_index = np.argmax(result, axis=1)
            return self._le.inverse_transform(class_index)

    predict.__doc__ = LGBMModel.predict.__doc__

    def predict_proba(self, X, raw_score=False, num_iteration=None,
                      pred_leaf=False, pred_contrib=False, **kwargs):
        """Return the predicted probability for each class for each sample.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input features matrix.
        raw_score : bool, optional (default=False)
            Whether to predict raw scores.
        num_iteration : int or None, optional (default=None)
            Limit number of iterations in the prediction.
            If None, if the best iteration exists, it is used; otherwise, all trees are used.
            If <= 0, all trees are used (no limits).
        pred_leaf : bool, optional (default=False)
            Whether to predict leaf index.
        pred_contrib : bool, optional (default=False)
            Whether to predict feature contributions.

            .. note::

                If you want to get more explanations for your model's predictions using SHAP values,
                like SHAP interaction values,
                you can install the shap package (https://github.com/slundberg/shap).
                Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra
                column, where the last column is the expected value.

        **kwargs
            Other parameters for the prediction.

        Returns
        -------
        predicted_probability : array-like of shape = [n_samples, n_classes]
            The predicted probability for each class for each sample.
        X_leaves : array-like of shape = [n_samples, n_trees * n_classes]
            If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
        X_SHAP_values : array-like of shape = [n_samples, (n_features + 1) * n_classes]
            If ``pred_contrib=True``, the feature contributions for each sample.
        """
        result = super(LGBMClassifier, self).predict(X, raw_score, num_iteration,
                                                     pred_leaf, pred_contrib, **kwargs)
        if callable(self._objective) and not (raw_score or pred_leaf or pred_contrib):
            warnings.warn("Cannot compute class probabilities or labels "
                          "due to the usage of customized objective function.\n"
                          "Returning raw scores instead.")
            return result
        elif self._n_classes > 2 or raw_score or pred_leaf or pred_contrib:
            return result
        else:
            # binary case: stack (1 - result) and result as the two class columns
            return np.vstack((1. - result, result)).transpose()

    @property
    def classes_(self):
        """:obj:`array` of shape = [n_classes]: The class label array."""
        if self._classes is None:
            raise LGBMNotFittedError('No classes found. Need to call fit beforehand.')
        return self._classes

    @property
    def n_classes_(self):
        """:obj:`int`: The number of classes."""
        if self._n_classes is None:
            raise LGBMNotFittedError('No classes found. Need to call fit beforehand.')
        return self._n_classes
wxchan's avatar
wxchan committed
901

wxchan's avatar
wxchan committed
902

wxchan's avatar
wxchan committed
903
class LGBMRanker(LGBMModel):
    """LightGBM ranker."""

    def fit(self, X, y,
            sample_weight=None, init_score=None, group=None,
            eval_set=None, eval_names=None, eval_sample_weight=None,
            eval_init_score=None, eval_group=None, eval_metric=None,
            eval_at=[1, 2, 3, 4, 5], early_stopping_rounds=None, verbose=True,
            feature_name='auto', categorical_feature='auto',
            callbacks=None, init_model=None):
        """Docstring is inherited from the LGBMModel."""
        # check group data
        if group is None:
            raise ValueError("Should set group for ranking task")

        if eval_set is not None:
            # LGBMModel.fit accepts a single (X, y) tuple as eval_set; wrap it here as well so that
            # the length comparison below counts datasets rather than the two items of the tuple
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            if eval_group is None:
                raise ValueError("Eval_group cannot be None when eval_set is not None")
            elif len(eval_group) != len(eval_set):
                raise ValueError("Length of eval_group should be equal to eval_set")
            elif (isinstance(eval_group, dict)
                  and any(i not in eval_group or eval_group[i] is None for i in range_(len(eval_group)))
                  or isinstance(eval_group, list)
                  and any(group is None for group in eval_group)):
                raise ValueError("Should set group for all eval datasets for ranking task; "
                                 "if you use dict, the index should start from 0")

        # copy list input so that the shared default list of the signature
        # is never aliased by the state of an estimator instance
        self._eval_at = list(eval_at) if isinstance(eval_at, list) else eval_at
        super(LGBMRanker, self).fit(X, y, sample_weight=sample_weight,
                                    init_score=init_score, group=group,
                                    eval_set=eval_set, eval_names=eval_names,
                                    eval_sample_weight=eval_sample_weight,
                                    eval_init_score=eval_init_score, eval_group=eval_group,
                                    eval_metric=eval_metric,
                                    early_stopping_rounds=early_stopping_rounds,
                                    verbose=verbose, feature_name=feature_name,
                                    categorical_feature=categorical_feature,
                                    callbacks=callbacks, init_model=init_model)
        return self

    # Build the documentation of fit from the one of LGBMModel.fit:
    # first cut out the text from 'eval_class_weight :' up to 'eval_init_score :'
    # (this signature has no ``eval_class_weight`` argument) ...
    _base_doc = LGBMModel.fit.__doc__
    fit.__doc__ = (_base_doc[:_base_doc.find('eval_class_weight :')]
                   + _base_doc[_base_doc.find('eval_init_score :'):])
    # ... then insert the description of ``eval_at`` right before 'early_stopping_rounds :'
    _base_doc = fit.__doc__
    _before_early_stop, _early_stop, _after_early_stop = _base_doc.partition('early_stopping_rounds :')
    fit.__doc__ = (_before_early_stop
                   + 'eval_at : list of int, optional (default=[1, 2, 3, 4, 5])\n'
                   + ' ' * 12 + 'The evaluation positions of the specified metric.\n'
                   + ' ' * 8 + _early_stop + _after_early_stop)