# coding: utf-8
"""Scikit-learn wrapper interface for LightGBM."""
import copy
from inspect import signature
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import scipy.sparse

from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _LGBM_BoosterBestScoreType,
                    _LGBM_CategoricalFeatureConfiguration, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration,
                    _LGBM_GroupType, _LGBM_LabelType, _log_warning)
from .callback import _EvalResultDict, record_evaluation
from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray,
                     _LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase,
                     _LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase,
                     dt_DataTable, pd_DataFrame)
from .engine import train

__all__ = [
    'LGBMClassifier',
    'LGBMModel',
    'LGBMRanker',
    'LGBMRegressor',
]

_LGBM_ScikitMatrixLike = Union[
    dt_DataTable,
    List[Union[List[float], List[int]]],
    np.ndarray,
    pd_DataFrame,
    scipy.sparse.spmatrix
]
_LGBM_ScikitCustomObjectiveFunction = Union[
    Callable[
        [np.ndarray, np.ndarray],
        Tuple[np.ndarray, np.ndarray]
    ],
    Callable[
        [np.ndarray, np.ndarray, np.ndarray],
        Tuple[np.ndarray, np.ndarray]
    ],
    Callable[
        [np.ndarray, np.ndarray, np.ndarray, np.ndarray],
        Tuple[np.ndarray, np.ndarray]
    ],
]

_LGBM_ScikitCustomEvalFunction = Union[
    Callable[
        [np.ndarray, np.ndarray],
        Union[_LGBM_EvalFunctionResultType, List[_LGBM_EvalFunctionResultType]]
    ],
    Callable[
        [np.ndarray, np.ndarray, np.ndarray],
        Union[_LGBM_EvalFunctionResultType, List[_LGBM_EvalFunctionResultType]]
    ],
    Callable[
        [np.ndarray, np.ndarray, np.ndarray, np.ndarray],
        Union[_LGBM_EvalFunctionResultType, List[_LGBM_EvalFunctionResultType]]
    ],
]
_LGBM_ScikitEvalMetricType = Union[
    str,
    _LGBM_ScikitCustomEvalFunction,
    List[Union[str, _LGBM_ScikitCustomEvalFunction]]
]
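
# For example, eval_metric may be a built-in metric name like "l1", a single
# callable with one of the signatures above, or a mixed list of both; strings
# must name built-in LightGBM metrics (see the fit docstring below).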


class _ObjectiveFunctionWrapper:
    """Proxy class for objective function."""

    def __init__(self, func: _LGBM_ScikitCustomObjectiveFunction):
        """Construct a proxy class.

        This class transforms objective function to match objective function with signature ``new_func(preds, dataset)``
        as expected by ``lightgbm.engine.train``.

        Parameters
        ----------
        func : callable
            Expects a callable with following signatures:
            ``func(y_true, y_pred)``,
            ``func(y_true, y_pred, weight)``
            or ``func(y_true, y_pred, weight, group)``
            and returns (grad, hess):

                y_true : numpy 1-D array of shape = [n_samples]
                    The target values.
                y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
                    The predicted values.
                    Predicted values are returned before any transformation,
                    e.g. they are raw margin instead of probability of positive class for binary task.
                weight : numpy 1-D array of shape = [n_samples]
                    The weight of samples. Weights should be non-negative.
                group : numpy 1-D array
                    Group/query data.
                    Only used in the learning-to-rank task.
                    sum(group) = n_samples.
                    For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
                    where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
                grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
                    The value of the first order derivative (gradient) of the loss
                    with respect to the elements of y_pred for each sample point.
                hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
                    The value of the second order derivative (Hessian) of the loss
                    with respect to the elements of y_pred for each sample point.

        .. note::

            For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes],
            and grad and hess should be returned in the same format.
        """
        self.func = func

    def __call__(self, preds: np.ndarray, dataset: Dataset) -> Tuple[np.ndarray, np.ndarray]:
        """Call passed function with appropriate arguments.

        Parameters
        ----------
        preds : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
            The predicted values.
        dataset : Dataset
            The training dataset.

        Returns
        -------
        grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
            The value of the first order derivative (gradient) of the loss
            with respect to the elements of preds for each sample point.
        hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
            The value of the second order derivative (Hessian) of the loss
            with respect to the elements of preds for each sample point.
        """
        labels = dataset.get_label()
        argc = len(signature(self.func).parameters)
        if argc == 2:
            grad, hess = self.func(labels, preds)
        elif argc == 3:
            grad, hess = self.func(labels, preds, dataset.get_weight())
        elif argc == 4:
            grad, hess = self.func(labels, preds, dataset.get_weight(), dataset.get_group())
        else:
            raise TypeError(f"Self-defined objective function should have 2, 3 or 4 arguments, got {argc}")
        return grad, hess
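
# A minimal sketch of a custom objective compatible with the wrapper above,
# matching the 2-argument ``func(y_true, y_pred)`` form (illustrative only,
# not part of the library):
#
#     def l2_objective(y_true, y_pred):
#         grad = y_pred - y_true        # d/dy_pred of 0.5 * (y_pred - y_true) ** 2
#         hess = np.ones_like(y_pred)   # second derivative is constant
#         return grad, hess
#
# Passing ``LGBMRegressor(objective=l2_objective)`` routes the function through
# _ObjectiveFunctionWrapper, which supplies the labels from the Dataset.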

class _EvalFunctionWrapper:
    """Proxy class for evaluation function."""

    def __init__(self, func: _LGBM_ScikitCustomEvalFunction):
        """Construct a proxy class.

        This class transforms evaluation function to match evaluation function with signature ``new_func(preds, dataset)``
        as expected by ``lightgbm.engine.train``.

        Parameters
        ----------
        func : callable
            Expects a callable with following signatures:
            ``func(y_true, y_pred)``,
            ``func(y_true, y_pred, weight)``
            or ``func(y_true, y_pred, weight, group)``
            and returns (eval_name, eval_result, is_higher_better) or
            list of (eval_name, eval_result, is_higher_better):

                y_true : numpy 1-D array of shape = [n_samples]
                    The target values.
                y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
                    The predicted values.
                    In case of custom ``objective``, predicted values are returned before any transformation,
                    e.g. they are raw margin instead of probability of positive class for binary task in this case.
                weight : numpy 1-D array of shape = [n_samples]
                    The weight of samples. Weights should be non-negative.
                group : numpy 1-D array
                    Group/query data.
                    Only used in the learning-to-rank task.
                    sum(group) = n_samples.
                    For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
                    where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
                eval_name : str
                    The name of evaluation function (without whitespace).
                eval_result : float
                    The eval result.
                is_higher_better : bool
                    Is eval result higher better, e.g. AUC is ``is_higher_better``.
        """
        self.func = func

    def __call__(
        self,
        preds: np.ndarray,
        dataset: Dataset
    ) -> Union[_LGBM_EvalFunctionResultType, List[_LGBM_EvalFunctionResultType]]:
        """Call passed function with appropriate arguments.

        Parameters
        ----------
        preds : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
            The predicted values.
        dataset : Dataset
            The training dataset.

        Returns
        -------
        eval_name : str
            The name of evaluation function (without whitespace).
        eval_result : float
            The eval result.
        is_higher_better : bool
            Is eval result higher better, e.g. AUC is ``is_higher_better``.
        """
        labels = dataset.get_label()
        argc = len(signature(self.func).parameters)
        if argc == 2:
            return self.func(labels, preds)
        elif argc == 3:
            return self.func(labels, preds, dataset.get_weight())
        elif argc == 4:
            return self.func(labels, preds, dataset.get_weight(), dataset.get_group())
        else:
            raise TypeError(f"Self-defined eval function should have 2, 3 or 4 arguments, got {argc}")
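
# A minimal sketch of a custom eval metric compatible with the wrapper above,
# returning the (eval_name, eval_result, is_higher_better) triple it documents
# (illustrative only, not part of the library):
#
#     def rmse_metric(y_true, y_pred):
#         return 'rmse', float(np.sqrt(np.mean((y_pred - y_true) ** 2))), False
#
# Passed as ``eval_metric=rmse_metric`` to ``fit``, _EvalFunctionWrapper inspects
# its signature and calls it with labels pulled from each validation Dataset.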


# documentation templates for LGBMModel methods are shared between the classes in
# this module and those in the ``dask`` module

_lgbmmodel_doc_fit = (
    """
    Build a gradient boosting model from the training set (X, y).

    Parameters
    ----------
    X : {X_shape}
        Input feature matrix.
    y : {y_shape}
        The target values (class labels in classification, real numbers in regression).
    sample_weight : {sample_weight_shape}
        Weights of training data. Weights should be non-negative.
    init_score : {init_score_shape}
        Init score of training data.
    group : {group_shape}
        Group/query data.
        Only used in the learning-to-rank task.
        sum(group) = n_samples.
        For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
        where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
    eval_set : list or None, optional (default=None)
        A list of (X, y) tuple pairs to use as validation sets.
    eval_names : list of str, or None, optional (default=None)
        Names of eval_set.
    eval_sample_weight : {eval_sample_weight_shape}
        Weights of eval data. Weights should be non-negative.
    eval_class_weight : list or None, optional (default=None)
        Class weights of eval data.
    eval_init_score : {eval_init_score_shape}
        Init score of eval data.
    eval_group : {eval_group_shape}
        Group data of eval data.
    eval_metric : str, callable, list or None, optional (default=None)
        If str, it should be a built-in evaluation metric to use.
        If callable, it should be a custom evaluation metric, see note below for more details.
        If list, it can be a list of built-in metrics, a list of custom evaluation metrics, or a mix of both.
        In either case, the ``metric`` from the model parameters will be evaluated and used as well.
        Default: 'l2' for LGBMRegressor, 'logloss' for LGBMClassifier, 'ndcg' for LGBMRanker.
    feature_name : list of str, or 'auto', optional (default='auto')
        Feature names.
        If 'auto' and data is pandas DataFrame, data columns names are used.
    categorical_feature : list of str or int, or 'auto', optional (default='auto')
        Categorical features.
        If list of int, interpreted as indices.
        If list of str, interpreted as feature names (need to specify ``feature_name`` as well).
        If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
        All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647).
        Large values could be memory consuming. Consider using consecutive integers starting from zero.
        All negative values in categorical features will be treated as missing values.
        The output cannot be monotonically constrained with respect to a categorical feature.
        Floating point numbers in categorical features will be rounded towards 0.
    callbacks : list of callable, or None, optional (default=None)
        List of callback functions that are applied at each iteration.
        See Callbacks in Python API for more information.
    init_model : str, pathlib.Path, Booster, LGBMModel or None, optional (default=None)
        Filename of LightGBM model, Booster instance or LGBMModel instance used to continue training.

    Returns
    -------
    self : LGBMModel
        Returns self.
    """
)

_lgbmmodel_doc_custom_eval_note = """
    Note
    ----
    Custom eval function expects a callable with following signatures:
    ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)`` or
    ``func(y_true, y_pred, weight, group)``
    and returns (eval_name, eval_result, is_higher_better) or
    list of (eval_name, eval_result, is_higher_better):

        y_true : numpy 1-D array of shape = [n_samples]
            The target values.
        y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
            The predicted values.
            In case of custom ``objective``, predicted values are returned before any transformation,
            e.g. they are raw margin instead of probability of positive class for binary task in this case.
        weight : numpy 1-D array of shape = [n_samples]
            The weight of samples. Weights should be non-negative.
        group : numpy 1-D array
            Group/query data.
            Only used in the learning-to-rank task.
            sum(group) = n_samples.
            For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
            where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
        eval_name : str
            The name of evaluation function (without whitespace).
        eval_result : float
            The eval result.
        is_higher_better : bool
            Is eval result higher better, e.g. AUC is ``is_higher_better``.
"""

_lgbmmodel_doc_predict = (
    """
    {description}

    Parameters
    ----------
    X : {X_shape}
        Input features matrix.
    raw_score : bool, optional (default=False)
        Whether to predict raw scores.
    start_iteration : int, optional (default=0)
        Start index of the iteration to predict.
        If <= 0, starts from the first iteration.
    num_iteration : int or None, optional (default=None)
        Total number of iterations used in the prediction.
        If None, if the best iteration exists and start_iteration <= 0, the best iteration is used;
        otherwise, all iterations from ``start_iteration`` are used (no limits).
        If <= 0, all iterations from ``start_iteration`` are used (no limits).
    pred_leaf : bool, optional (default=False)
        Whether to predict leaf index.
    pred_contrib : bool, optional (default=False)
        Whether to predict feature contributions.

        .. note::

            If you want to get more explanations for your model's predictions using SHAP values,
            like SHAP interaction values,
            you can install the shap package (https://github.com/slundberg/shap).
            Note that unlike the shap package, with ``pred_contrib`` we return a matrix with an extra
            column, where the last column is the expected value.

    validate_features : bool, optional (default=False)
        If True, ensure that the features used to predict match the ones used to train.
        Used only if data is pandas DataFrame.
    **kwargs
        Other parameters for the prediction.

    Returns
    -------
    {output_name} : {predicted_result_shape}
        The predicted values.
    X_leaves : {X_leaves_shape}
        If ``pred_leaf=True``, the predicted leaf of every tree for each sample.
    X_SHAP_values : {X_SHAP_values_shape}
        If ``pred_contrib=True``, the feature contributions for each sample.
    """
)


class LGBMModel(_LGBMModelBase):
    """Implementation of the scikit-learn API for LightGBM."""
    def __init__(
        self,
        boosting_type: str = 'gbdt',
        num_leaves: int = 31,
        max_depth: int = -1,
        learning_rate: float = 0.1,
        n_estimators: int = 100,
        subsample_for_bin: int = 200000,
        objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None,
        class_weight: Optional[Union[Dict, str]] = None,
        min_split_gain: float = 0.,
        min_child_weight: float = 1e-3,
        min_child_samples: int = 20,
        subsample: float = 1.,
        subsample_freq: int = 0,
        colsample_bytree: float = 1.,
        reg_alpha: float = 0.,
        reg_lambda: float = 0.,
        random_state: Optional[Union[int, np.random.RandomState]] = None,
        n_jobs: Optional[int] = None,
        importance_type: str = 'split',
        **kwargs
    ):
        r"""Construct a gradient boosting model.

        Parameters
        ----------
        boosting_type : str, optional (default='gbdt')
            'gbdt', traditional Gradient Boosting Decision Tree.
            'dart', Dropouts meet Multiple Additive Regression Trees.
            'rf', Random Forest.
        num_leaves : int, optional (default=31)
            Maximum tree leaves for base learners.
        max_depth : int, optional (default=-1)
            Maximum tree depth for base learners, <=0 means no limit.
        learning_rate : float, optional (default=0.1)
            Boosting learning rate.
            You can use ``callbacks`` parameter of ``fit`` method to shrink/adapt learning rate
            in training using ``reset_parameter`` callback.
            Note, that this will ignore the ``learning_rate`` argument in training.
        n_estimators : int, optional (default=100)
            Number of boosted trees to fit.
        subsample_for_bin : int, optional (default=200000)
            Number of samples for constructing bins.
        objective : str, callable or None, optional (default=None)
            Specify the learning task and the corresponding learning objective or
            a custom objective function to be used (see note below).
            Default: 'regression' for LGBMRegressor, 'binary' or 'multiclass' for LGBMClassifier, 'lambdarank' for LGBMRanker.
        class_weight : dict, 'balanced' or None, optional (default=None)
            Weights associated with classes in the form ``{class_label: weight}``.
            Use this parameter only for multi-class classification task;
            for binary classification task you may use ``is_unbalance`` or ``scale_pos_weight`` parameters.
            Note, that the usage of all these parameters will result in poor estimates of the individual class probabilities.
            You may want to consider performing probability calibration
            (https://scikit-learn.org/stable/modules/calibration.html) of your model.
            The 'balanced' mode uses the values of y to automatically adjust weights
            inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.
            If None, all classes are supposed to have weight one.
            Note, that these weights will be multiplied with ``sample_weight`` (passed through the ``fit`` method)
            if ``sample_weight`` is specified.
        min_split_gain : float, optional (default=0.)
            Minimum loss reduction required to make a further partition on a leaf node of the tree.
        min_child_weight : float, optional (default=1e-3)
            Minimum sum of instance weight (Hessian) needed in a child (leaf).
        min_child_samples : int, optional (default=20)
            Minimum number of data needed in a child (leaf).
        subsample : float, optional (default=1.)
            Subsample ratio of the training instances.
        subsample_freq : int, optional (default=0)
            Frequency of subsample, <=0 means disabled.
        colsample_bytree : float, optional (default=1.)
            Subsample ratio of columns when constructing each tree.
        reg_alpha : float, optional (default=0.)
            L1 regularization term on weights.
        reg_lambda : float, optional (default=0.)
            L2 regularization term on weights.
        random_state : int, RandomState object or None, optional (default=None)
            Random number seed.
            If int, this number is used to seed the C++ code.
            If RandomState object (numpy), a random integer is picked based on its state to seed the C++ code.
            If None, default seeds in C++ code are used.
        n_jobs : int or None, optional (default=None)
            Number of parallel threads to use for training (can be changed at prediction time by
            passing it as an extra keyword argument).

            For better performance, it is recommended to set this to the number of physical cores
            in the CPU.

            Negative integers are interpreted as following joblib's formula (n_cpus + 1 + n_jobs), just like
            scikit-learn (so e.g. -1 means using all threads). A value of zero corresponds to the default number of
            threads configured for OpenMP in the system. A value of ``None`` (the default) corresponds
            to using the number of physical cores in the system (its correct detection requires
            either the ``joblib`` or the ``psutil`` util libraries to be installed).
        importance_type : str, optional (default='split')
            The type of feature importance to be filled into ``feature_importances_``.
            If 'split', result contains numbers of times the feature is used in a model.
            If 'gain', result contains total gains of splits which use the feature.
        **kwargs
            Other parameters for the model.
            Check http://lightgbm.readthedocs.io/en/latest/Parameters.html for more parameters.

            .. warning::

                \*\*kwargs is not supported in sklearn, it may cause unexpected issues.

        Note
        ----
        A custom objective function can be provided for the ``objective`` parameter.
        In this case, it should have the signature
        ``objective(y_true, y_pred) -> grad, hess``,
        ``objective(y_true, y_pred, weight) -> grad, hess``
        or ``objective(y_true, y_pred, weight, group) -> grad, hess``:

            y_true : numpy 1-D array of shape = [n_samples]
                The target values.
            y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
                The predicted values.
                Predicted values are returned before any transformation,
                e.g. they are raw margin instead of probability of positive class for binary task.
            weight : numpy 1-D array of shape = [n_samples]
                The weight of samples. Weights should be non-negative.
            group : numpy 1-D array
                Group/query data.
                Only used in the learning-to-rank task.
                sum(group) = n_samples.
                For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
                where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
            grad : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
                The value of the first order derivative (gradient) of the loss
                with respect to the elements of y_pred for each sample point.
            hess : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
                The value of the second order derivative (Hessian) of the loss
                with respect to the elements of y_pred for each sample point.

        For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes],
        and grad and hess should be returned in the same format.
        """
        if not SKLEARN_INSTALLED:
            raise LightGBMError('scikit-learn is required for lightgbm.sklearn. '
                                'You must install scikit-learn and restart your session to use this module.')

        self.boosting_type = boosting_type
        self.objective = objective
        self.num_leaves = num_leaves
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.subsample_for_bin = subsample_for_bin
        self.min_split_gain = min_split_gain
        self.min_child_weight = min_child_weight
        self.min_child_samples = min_child_samples
        self.subsample = subsample
        self.subsample_freq = subsample_freq
        self.colsample_bytree = colsample_bytree
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.importance_type = importance_type
        self._Booster: Optional[Booster] = None
        self._evals_result: _EvalResultDict = {}
        self._best_score: _LGBM_BoosterBestScoreType = {}
        self._best_iteration: int = -1
        self._other_params: Dict[str, Any] = {}
        self._objective = objective
        self.class_weight = class_weight
        self._class_weight: Optional[Union[Dict, str]] = None
        self._class_map: Optional[Dict[int, int]] = None
        self._n_features: int = -1
        self._n_features_in: int = -1
        self._classes: Optional[np.ndarray] = None
        self._n_classes: int = -1
        self.set_params(**kwargs)

    def _more_tags(self) -> Dict[str, Any]:
        return {
            'allow_nan': True,
            'X_types': ['2darray', 'sparse', '1dlabels'],
            '_xfail_checks': {
                'check_no_attributes_set_in_init':
                'scikit-learn incorrectly asserts that private attributes '
                'cannot be set in __init__: '
                '(see https://github.com/microsoft/LightGBM/issues/2628)'
            }
        }
    def __sklearn_is_fitted__(self) -> bool:
        return getattr(self, "fitted_", False)

    def get_params(self, deep: bool = True) -> Dict[str, Any]:
        """Get parameters for this estimator.

        Parameters
        ----------
        deep : bool, optional (default=True)
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : dict
            Parameter names mapped to their values.
        """
        params = super().get_params(deep=deep)
        params.update(self._other_params)
        return params

    def set_params(self, **params: Any) -> "LGBMModel":
        """Set the parameters of this estimator.

        Parameters
        ----------
        **params
            Parameter names with their new values.

        Returns
        -------
        self : object
            Returns self.
        """
        for key, value in params.items():
            setattr(self, key, value)
            if hasattr(self, f"_{key}"):
                setattr(self, f"_{key}", value)
            self._other_params[key] = value
        return self
    def _process_params(self, stage: str) -> Dict[str, Any]:
        """Process the parameters of this estimator based on its type, parameter aliases, etc.

        Parameters
        ----------
        stage : str
            Name of the stage (can be ``fit`` or ``predict``) this method is called from.

        Returns
        -------
        processed_params : dict
            Processed parameter names mapped to their values.
        """
        assert stage in {"fit", "predict"}
        params = self.get_params()

        params.pop('objective', None)
        for alias in _ConfigAliases.get('objective'):
            if alias in params:
                obj = params.pop(alias)
                _log_warning(f"Found '{alias}' in params. Will use it instead of 'objective' argument")
                if stage == "fit":
                    self._objective = obj
        if stage == "fit":
            if self._objective is None:
                if isinstance(self, LGBMRegressor):
                    self._objective = "regression"
                elif isinstance(self, LGBMClassifier):
                    if self._n_classes > 2:
                        self._objective = "multiclass"
                    else:
                        self._objective = "binary"
                elif isinstance(self, LGBMRanker):
                    self._objective = "lambdarank"
                else:
                    raise ValueError("Unknown LGBMModel type.")
        if callable(self._objective):
            if stage == "fit":
                params['objective'] = _ObjectiveFunctionWrapper(self._objective)
            else:
                params['objective'] = 'None'
        else:
            params['objective'] = self._objective

        params.pop('importance_type', None)
        params.pop('n_estimators', None)
        params.pop('class_weight', None)

        if isinstance(params['random_state'], np.random.RandomState):
            params['random_state'] = params['random_state'].randint(np.iinfo(np.int32).max)
        if self._n_classes > 2:
            for alias in _ConfigAliases.get('num_class'):
                params.pop(alias, None)
            params['num_class'] = self._n_classes
        if hasattr(self, '_eval_at'):
            eval_at = self._eval_at
            for alias in _ConfigAliases.get('eval_at'):
                if alias in params:
                    _log_warning(f"Found '{alias}' in params. Will use it instead of 'eval_at' argument")
                    eval_at = params.pop(alias)
            params['eval_at'] = eval_at

        # register default metric for consistency with callable eval_metric case
        original_metric = self._objective if isinstance(self._objective, str) else None
        if original_metric is None:
            # try to deduce from class instance
            if isinstance(self, LGBMRegressor):
                original_metric = "l2"
            elif isinstance(self, LGBMClassifier):
                original_metric = "multi_logloss" if self._n_classes > 2 else "binary_logloss"
            elif isinstance(self, LGBMRanker):
                original_metric = "ndcg"

        # overwrite default metric by explicitly set metric
        params = _choose_param_value("metric", params, original_metric)

        # use joblib conventions for negative n_jobs, just like scikit-learn
        # at predict time, this is handled later due to the order of parameter updates
        if stage == "fit":
            params = _choose_param_value("num_threads", params, self.n_jobs)
            params["num_threads"] = self._process_n_jobs(params["num_threads"])

        return params

    def _process_n_jobs(self, n_jobs: Optional[int]) -> int:
        """Convert special values of n_jobs to their actual values according to the formulas that apply.

        Parameters
        ----------
        n_jobs : int or None
            The original value of n_jobs, potentially having special values such as 'None' or
            negative integers.

        Returns
        -------
        n_jobs : int
            The value of n_jobs with special values converted to actual number of threads.
        """
        if n_jobs is None:
            n_jobs = _LGBMCpuCount(only_physical_cores=True)
        elif n_jobs < 0:
            n_jobs = max(_LGBMCpuCount(only_physical_cores=False) + 1 + n_jobs, 1)
        return n_jobs
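
    # For example, on a machine with 8 physical / 16 logical cores, n_jobs=None
    # resolves to 8 (physical cores), n_jobs=-1 to 16 (n_cpus + 1 - 1), and
    # n_jobs=-2 to 15, following joblib's (n_cpus + 1 + n_jobs) convention with
    # a floor of 1.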

    def fit(
        self,
        X: _LGBM_ScikitMatrixLike,
        y: _LGBM_LabelType,
        sample_weight=None,
        init_score=None,
        group: Optional[_LGBM_GroupType] = None,
        eval_set=None,
        eval_names: Optional[List[str]] = None,
        eval_sample_weight=None,
        eval_class_weight=None,
        eval_init_score=None,
        eval_group=None,
        eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
        feature_name: _LGBM_FeatureNameConfiguration = 'auto',
        categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
        callbacks: Optional[List[Callable]] = None,
        init_model: Optional[Union[str, Path, Booster, "LGBMModel"]] = None
    ) -> "LGBMModel":
        """Docstring is set after definition, using a template."""
        params = self._process_params(stage="fit")

        # Do not modify original args in fit function
        # Refer to https://github.com/microsoft/LightGBM/pull/2619
        eval_metric_list = copy.deepcopy(eval_metric)
        if not isinstance(eval_metric_list, list):
            eval_metric_list = [eval_metric_list]

        # Separate built-in from callable evaluation metrics
        eval_metrics_callable = [_EvalFunctionWrapper(f) for f in eval_metric_list if callable(f)]
        eval_metrics_builtin = [m for m in eval_metric_list if isinstance(m, str)]

        # concatenate metric from params (or default if not provided in params) and eval_metric
        params['metric'] = [params['metric']] if isinstance(params['metric'], (str, type(None))) else params['metric']
        params['metric'] = [e for e in eval_metrics_builtin if e not in params['metric']] + params['metric']
        params['metric'] = [metric for metric in params['metric'] if metric is not None]

        if not isinstance(X, (pd_DataFrame, dt_DataTable)):
            _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2)
            if sample_weight is not None:
                sample_weight = _LGBMCheckSampleWeight(sample_weight, _X)
        else:
            _X, _y = X, y

        if self._class_weight is None:
            self._class_weight = self.class_weight
        if self._class_weight is not None:
            class_sample_weight = _LGBMComputeSampleWeight(self._class_weight, y)
            if sample_weight is None or len(sample_weight) == 0:
                sample_weight = class_sample_weight
            else:
                sample_weight = np.multiply(sample_weight, class_sample_weight)

        self._n_features = _X.shape[1]
        # copy for consistency
        self._n_features_in = self._n_features

        train_set = Dataset(data=_X, label=_y, weight=sample_weight, group=group,
                            init_score=init_score, categorical_feature=categorical_feature,
                            params=params)

        valid_sets: List[Dataset] = []
        if eval_set is not None:

            def _get_meta_data(collection, name, i):
                if collection is None:
                    return None
                elif isinstance(collection, list):
                    return collection[i] if len(collection) > i else None
                elif isinstance(collection, dict):
                    return collection.get(i, None)
                else:
                    raise TypeError(f"{name} should be dict or list")

            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            for i, valid_data in enumerate(eval_set):
                # reduce cost for prediction training data
                if valid_data[0] is X and valid_data[1] is y:
                    valid_set = train_set
                else:
                    valid_weight = _get_meta_data(eval_sample_weight, 'eval_sample_weight', i)
                    valid_class_weight = _get_meta_data(eval_class_weight, 'eval_class_weight', i)
                    if valid_class_weight is not None:
                        if isinstance(valid_class_weight, dict) and self._class_map is not None:
                            valid_class_weight = {self._class_map[k]: v for k, v in valid_class_weight.items()}
                        valid_class_sample_weight = _LGBMComputeSampleWeight(valid_class_weight, valid_data[1])
                        if valid_weight is None or len(valid_weight) == 0:
                            valid_weight = valid_class_sample_weight
                        else:
                            valid_weight = np.multiply(valid_weight, valid_class_sample_weight)
                    valid_init_score = _get_meta_data(eval_init_score, 'eval_init_score', i)
                    valid_group = _get_meta_data(eval_group, 'eval_group', i)
                    valid_set = Dataset(data=valid_data[0], label=valid_data[1], weight=valid_weight,
                                        group=valid_group, init_score=valid_init_score,
                                        categorical_feature='auto', params=params)

                valid_sets.append(valid_set)

        if isinstance(init_model, LGBMModel):
            init_model = init_model.booster_

        if callbacks is None:
            callbacks = []
        else:
            callbacks = copy.copy(callbacks)  # don't use deepcopy here to allow non-serializable objects

        evals_result: _EvalResultDict = {}
        callbacks.append(record_evaluation(evals_result))

        self._Booster = train(
            params=params,
            train_set=train_set,
            num_boost_round=self.n_estimators,
            valid_sets=valid_sets,
            valid_names=eval_names,
            feval=eval_metrics_callable,
            init_model=init_model,
            feature_name=feature_name,
            callbacks=callbacks
        )

        self._evals_result = evals_result
        self._best_iteration = self._Booster.best_iteration
        self._best_score = self._Booster.best_score

        self.fitted_ = True

        # free dataset
        self._Booster.free_dataset()
        del train_set, valid_sets
        return self

    fit.__doc__ = _lgbmmodel_doc_fit.format(
        X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
        y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples]",
        sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)",
        init_score_shape="array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
        group_shape="numpy array, pandas Series, list of int or float, or None, optional (default=None)",
        eval_sample_weight_shape="list of array, or None, optional (default=None)",
        eval_init_score_shape="list of array, or None, optional (default=None)",
        eval_group_shape="list of array, or None, optional (default=None)"
    ) + "\n\n" + _lgbmmodel_doc_custom_eval_note
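
    # Illustrative use of the fit contract documented above (X_train, y_train and
    # the validation pair are placeholder arrays, not library objects):
    #
    #     model = LGBMModel(objective="regression", n_estimators=50)
    #     model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric="l1")
    #     model.best_score_  # populated from the recorded evaluation results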

    def predict(
        self,
        X: _LGBM_ScikitMatrixLike,
        raw_score: bool = False,
        start_iteration: int = 0,
        num_iteration: Optional[int] = None,
        pred_leaf: bool = False,
        pred_contrib: bool = False,
        validate_features: bool = False,
        **kwargs: Any
    ):
        """Docstring is set after definition, using a template."""
        if not self.__sklearn_is_fitted__():
            raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.")
        if not isinstance(X, (pd_DataFrame, dt_DataTable)):
            X = _LGBMCheckArray(X, accept_sparse=True, force_all_finite=False)
        n_features = X.shape[1]
        if self._n_features != n_features:
            raise ValueError("Number of features of the model must "
                             f"match the input. Model n_features_ is {self._n_features} and "
                             f"input n_features is {n_features}")
        # retrieve original params that possibly can be used in both training and prediction
        # and then overwrite them (considering aliases) with params that were passed directly in prediction
        predict_params = self._process_params(stage="predict")
        for alias in _ConfigAliases.get_by_alias(
            "data",
            "X",
            "raw_score",
            "start_iteration",
            "num_iteration",
            "pred_leaf",
            "pred_contrib",
            *kwargs.keys()
        ):
            predict_params.pop(alias, None)
        predict_params.update(kwargs)

        # number of threads can have values with special meaning which is only applied
        # in the scikit-learn interface, these should not reach the c++ side as-is
        predict_params = _choose_param_value("num_threads", predict_params, self.n_jobs)
        predict_params["num_threads"] = self._process_n_jobs(predict_params["num_threads"])

        return self._Booster.predict(  # type: ignore[union-attr]
            X, raw_score=raw_score, start_iteration=start_iteration, num_iteration=num_iteration,
            pred_leaf=pred_leaf, pred_contrib=pred_contrib, validate_features=validate_features,
            **predict_params
        )
    predict.__doc__ = _lgbmmodel_doc_predict.format(
        description="Return the predicted value for each sample.",
        X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
        output_name="predicted_result",
        predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]",
        X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
        X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects"
    )
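
    # Illustrative call of the method documented above: with ``pred_contrib=True``
    # a model trained on n_features columns returns [n_samples, n_features + 1]
    # values per class, the extra column being the expected value, as the template
    # notes.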

    @property
    def n_features_(self) -> int:
        """:obj:`int`: The number of features of fitted model."""
        if not self.__sklearn_is_fitted__():
            raise LGBMNotFittedError('No n_features found. Need to call fit beforehand.')
        return self._n_features

    @property
    def n_features_in_(self) -> int:
        """:obj:`int`: The number of features of fitted model."""
        if not self.__sklearn_is_fitted__():
            raise LGBMNotFittedError('No n_features_in found. Need to call fit beforehand.')
        return self._n_features_in

    @property
    def best_score_(self) -> _LGBM_BoosterBestScoreType:
        """:obj:`dict`: The best score of fitted model."""
        if not self.__sklearn_is_fitted__():
            raise LGBMNotFittedError('No best_score found. Need to call fit beforehand.')
        return self._best_score

    @property
    def best_iteration_(self) -> int:
        """:obj:`int`: The best iteration of fitted model if ``early_stopping()`` callback has been specified."""
        if not self.__sklearn_is_fitted__():
            raise LGBMNotFittedError('No best_iteration found. Need to call fit with early_stopping callback beforehand.')
        return self._best_iteration

    @property
    def objective_(self) -> Union[str, _LGBM_ScikitCustomObjectiveFunction]:
        """:obj:`str` or :obj:`callable`: The concrete objective used while fitting this model."""
        if not self.__sklearn_is_fitted__():
            raise LGBMNotFittedError('No objective found. Need to call fit beforehand.')
        return self._objective

    @property
    def n_estimators_(self) -> int:
        """:obj:`int`: True number of boosting iterations performed.

        This might be less than parameter ``n_estimators`` if early stopping was enabled or
        if boosting stopped early due to limits on complexity like ``min_gain_to_split``.
        """
        if not self.__sklearn_is_fitted__():
            raise LGBMNotFittedError('No n_estimators found. Need to call fit beforehand.')
        return self._Booster.current_iteration()  # type: ignore

    @property
    def n_iter_(self) -> int:
        """:obj:`int`: True number of boosting iterations performed.

        This might be less than parameter ``n_estimators`` if early stopping was enabled or
        if boosting stopped early due to limits on complexity like ``min_gain_to_split``.
        """
        if not self.__sklearn_is_fitted__():
            raise LGBMNotFittedError('No n_iter found. Need to call fit beforehand.')
        return self._Booster.current_iteration()  # type: ignore

    @property
    def booster_(self) -> Booster:
        """Booster: The underlying Booster of this model."""
        if not self.__sklearn_is_fitted__():
            raise LGBMNotFittedError('No booster found. Need to call fit beforehand.')
        return self._Booster  # type: ignore[return-value]

    @property
    def evals_result_(self) -> _EvalResultDict:
        """:obj:`dict`: The evaluation results if validation sets have been specified."""
        if not self.__sklearn_is_fitted__():
            raise LGBMNotFittedError('No results found. Need to call fit with eval_set beforehand.')
        return self._evals_result

    @property
    def feature_importances_(self) -> np.ndarray:
        """:obj:`array` of shape = [n_features]: The feature importances (the higher, the more important).

        .. note::

            ``importance_type`` attribute is passed to the function
            to configure the type of importance values to be extracted.
        """
        if not self.__sklearn_is_fitted__():
            raise LGBMNotFittedError('No feature_importances found. Need to call fit beforehand.')
        return self._Booster.feature_importance(importance_type=self.importance_type)  # type: ignore[union-attr]

    @property
    def feature_name_(self) -> List[str]:
        """:obj:`list` of shape = [n_features]: The names of features."""
        if not self.__sklearn_is_fitted__():
            raise LGBMNotFittedError('No feature_name found. Need to call fit beforehand.')
        return self._Booster.feature_name()  # type: ignore[union-attr]


class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
    """LightGBM regressor."""
    def fit(  # type: ignore[override]
        self,
        X: _LGBM_ScikitMatrixLike,
        y: _LGBM_LabelType,
        sample_weight=None,
        init_score=None,
        eval_set=None,
        eval_names: Optional[List[str]] = None,
        eval_sample_weight=None,
        eval_init_score=None,
        eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
        feature_name: _LGBM_FeatureNameConfiguration = 'auto',
        categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
        callbacks: Optional[List[Callable]] = None,
        init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None
    ) -> "LGBMRegressor":
        """Docstring is inherited from the LGBMModel."""
        super().fit(
            X,
            y,
            sample_weight=sample_weight,
            init_score=init_score,
            eval_set=eval_set,
            eval_names=eval_names,
            eval_sample_weight=eval_sample_weight,
            eval_init_score=eval_init_score,
            eval_metric=eval_metric,
            feature_name=feature_name,
            categorical_feature=categorical_feature,
            callbacks=callbacks,
            init_model=init_model
        )
        return self

    _base_doc = LGBMModel.fit.__doc__.replace("self : LGBMModel", "self : LGBMRegressor")  # type: ignore
    _base_doc = (_base_doc[:_base_doc.find('group :')]  # type: ignore
                 + _base_doc[_base_doc.find('eval_set :'):])  # type: ignore
    _base_doc = (_base_doc[:_base_doc.find('eval_class_weight :')]
                 + _base_doc[_base_doc.find('eval_init_score :'):])
    fit.__doc__ = (_base_doc[:_base_doc.find('eval_group :')]
                   + _base_doc[_base_doc.find('eval_metric :'):])


class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
    """LightGBM classifier."""
    def fit(  # type: ignore[override]
        self,
        X: _LGBM_ScikitMatrixLike,
        y: _LGBM_LabelType,
        sample_weight=None,
        init_score=None,
        eval_set=None,
        eval_names: Optional[List[str]] = None,
        eval_sample_weight=None,
        eval_class_weight=None,
        eval_init_score=None,
        eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
        feature_name: _LGBM_FeatureNameConfiguration = 'auto',
        categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
        callbacks: Optional[List[Callable]] = None,
        init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None
    ) -> "LGBMClassifier":
        """Docstring is inherited from the LGBMModel."""
1066
        _LGBMAssertAllFinite(y)
1067
1068
        _LGBMCheckClassificationTargets(y)
        self._le = _LGBMLabelEncoder().fit(y)
1069
        _y = self._le.transform(y)
1070
        self._class_map = dict(zip(self._le.classes_, self._le.transform(self._le.classes_)))
1071
1072
        if isinstance(self.class_weight, dict):
            self._class_weight = {self._class_map[k]: v for k, v in self.class_weight.items()}
1073

1074
        self._classes = self._le.classes_
1075
        self._n_classes = len(self._classes)  # type: ignore[arg-type]
1076

        # adjust eval metrics to match whether binary or multiclass
        # classification is being performed
        if not callable(eval_metric):
            if isinstance(eval_metric, list):
                eval_metric_list = eval_metric
            elif isinstance(eval_metric, str):
                eval_metric_list = [eval_metric]
            else:
                eval_metric_list = []
            if self._n_classes > 2:
                for index, metric in enumerate(eval_metric_list):
                    if metric in {'logloss', 'binary_logloss'}:
                        eval_metric_list[index] = "multi_logloss"
                    elif metric in {'error', 'binary_error'}:
                        eval_metric_list[index] = "multi_error"
            else:
                for index, metric in enumerate(eval_metric_list):
                    if metric in {'logloss', 'multi_logloss'}:
                        eval_metric_list[index] = 'binary_logloss'
                    elif metric in {'error', 'multi_error'}:
                        eval_metric_list[index] = 'binary_error'
            eval_metric = eval_metric_list
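            # e.g. with 3 classes, eval_metric='logloss' is remapped to ['multi_logloss'];
            # with 2 classes, eval_metric=['multi_error'] is remapped to ['binary_error']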

        # do not modify args, as it causes errors in model selection tools
        valid_sets: Optional[List[Tuple]] = None
        if eval_set is not None:
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
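            # re-encode eval_set labels with the same LabelEncoder used for y,
            # reusing the already-encoded _y when the training pair is passed again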
            valid_sets = []
            for valid_x, valid_y in eval_set:
                if valid_x is X and valid_y is y:
                    valid_sets.append((valid_x, _y))
                else:
                    valid_sets.append((valid_x, self._le.transform(valid_y)))

        super().fit(
            X,
            _y,
            sample_weight=sample_weight,
            init_score=init_score,
            eval_set=valid_sets,
            eval_names=eval_names,
            eval_sample_weight=eval_sample_weight,
            eval_class_weight=eval_class_weight,
            eval_init_score=eval_init_score,
            eval_metric=eval_metric,
            feature_name=feature_name,
            categorical_feature=categorical_feature,
            callbacks=callbacks,
            init_model=init_model
        )
        return self

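    # build fit's docstring from LGBMModel.fit's, dropping the ranking-only
    # parameters ('group', 'eval_group')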
    _base_doc = LGBMModel.fit.__doc__.replace("self : LGBMModel", "self : LGBMClassifier")  # type: ignore
    _base_doc = (_base_doc[:_base_doc.find('group :')]  # type: ignore
                 + _base_doc[_base_doc.find('eval_set :'):])  # type: ignore
    fit.__doc__ = (_base_doc[:_base_doc.find('eval_group :')]
                   + _base_doc[_base_doc.find('eval_metric :'):])

    def predict(
        self,
        X: _LGBM_ScikitMatrixLike,
        raw_score: bool = False,
        start_iteration: int = 0,
        num_iteration: Optional[int] = None,
        pred_leaf: bool = False,
        pred_contrib: bool = False,
        validate_features: bool = False,
        **kwargs: Any
    ):
        """Docstring is inherited from the LGBMModel."""
        result = self.predict_proba(
            X=X,
            raw_score=raw_score,
            start_iteration=start_iteration,
            num_iteration=num_iteration,
            pred_leaf=pred_leaf,
            pred_contrib=pred_contrib,
            validate_features=validate_features,
            **kwargs
        )
        if callable(self._objective) or raw_score or pred_leaf or pred_contrib:
            return result
        else:
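            # pick the most probable class and map it back to the original labels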
            class_index = np.argmax(result, axis=1)
            return self._le.inverse_transform(class_index)

    predict.__doc__ = LGBMModel.predict.__doc__

    def predict_proba(
        self,
        X: _LGBM_ScikitMatrixLike,
        raw_score: bool = False,
        start_iteration: int = 0,
        num_iteration: Optional[int] = None,
        pred_leaf: bool = False,
        pred_contrib: bool = False,
        validate_features: bool = False,
        **kwargs: Any
    ):
        """Docstring is set after definition, using a template."""
        result = super().predict(
            X=X,
            raw_score=raw_score,
            start_iteration=start_iteration,
            num_iteration=num_iteration,
            pred_leaf=pred_leaf,
            pred_contrib=pred_contrib,
            validate_features=validate_features,
            **kwargs
        )
        if callable(self._objective) and not (raw_score or pred_leaf or pred_contrib):
            _log_warning("Cannot compute class probabilities or labels "
                         "because a customized objective function is used.\n"
                         "Returning raw scores instead.")
            return result
        elif self._n_classes > 2 or raw_score or pred_leaf or pred_contrib:  # type: ignore [operator]
            return result
        else:
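            # binary task: the booster returns only the positive-class probability,
            # so stack (1 - p, p) into an [n_samples, 2] array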
            return np.vstack((1. - result, result)).transpose()

    predict_proba.__doc__ = _lgbmmodel_doc_predict.format(
        description="Return the predicted probability for each class for each sample.",
        X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
        output_name="predicted_probability",
        predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]",
        X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
        X_SHAP_values_shape="array-like of shape = [n_samples, n_features + 1] or shape = [n_samples, (n_features + 1) * n_classes] or list with n_classes length of such objects"
    )

    @property
    def classes_(self) -> np.ndarray:
        """:obj:`array` of shape = [n_classes]: The class label array."""
        if not self.__sklearn_is_fitted__():
            raise LGBMNotFittedError('No classes found. Need to call fit beforehand.')
        return self._classes  # type: ignore[return-value]

    @property
    def n_classes_(self) -> int:
        """:obj:`int`: The number of classes."""
        if not self.__sklearn_is_fitted__():
            raise LGBMNotFittedError('No classes found. Need to call fit beforehand.')
        return self._n_classes


class LGBMRanker(LGBMModel):
    """LightGBM ranker.

    .. warning::

        scikit-learn doesn't support ranking applications yet,
        therefore this class is not really compatible with the sklearn ecosystem.
        Please use this class mainly for training and applying ranking models in the usual scikit-learn-like way.
    """

    def fit(  # type: ignore[override]
        self,
        X: _LGBM_ScikitMatrixLike,
        y: _LGBM_LabelType,
        sample_weight=None,
        init_score=None,
        group: Optional[_LGBM_GroupType] = None,
        eval_set=None,
        eval_names: Optional[List[str]] = None,
        eval_sample_weight=None,
        eval_init_score=None,
        eval_group=None,
        eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
        eval_at: Union[List[int], Tuple[int, ...]] = (1, 2, 3, 4, 5),
        feature_name: _LGBM_FeatureNameConfiguration = 'auto',
        categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
        callbacks: Optional[List[Callable]] = None,
        init_model: Optional[Union[str, Path, Booster, LGBMModel]] = None
    ) -> "LGBMRanker":
        """Docstring is inherited from the LGBMModel."""
        # check group data
        if group is None:
            raise ValueError("Should set group for ranking task")

        if eval_set is not None:
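            # each validation set needs its own group sizes: either a list with one
            # array-like per eval set, or a dict keyed by eval-set index starting from 0,
            # e.g. eval_group=[g1, g2] or eval_group={0: g1, 1: g2}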
            if eval_group is None:
                raise ValueError("Eval_group cannot be None when eval_set is not None")
            elif len(eval_group) != len(eval_set):
                raise ValueError("Length of eval_group should be equal to length of eval_set")
            elif (isinstance(eval_group, dict)
                  and any(i not in eval_group or eval_group[i] is None for i in range(len(eval_group)))
                  or isinstance(eval_group, list)
                  and any(group is None for group in eval_group)):
                raise ValueError("Should set group for all eval datasets for ranking task; "
                                 "if you use dict, the index should start from 0")

        self._eval_at = eval_at
        super().fit(
            X,
            y,
            sample_weight=sample_weight,
            init_score=init_score,
            group=group,
            eval_set=eval_set,
            eval_names=eval_names,
            eval_sample_weight=eval_sample_weight,
            eval_init_score=eval_init_score,
            eval_group=eval_group,
            eval_metric=eval_metric,
            feature_name=feature_name,
            categorical_feature=categorical_feature,
            callbacks=callbacks,
            init_model=init_model
        )
        return self

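    # build fit's docstring from LGBMModel.fit's, dropping 'eval_class_weight' and
    # inserting documentation for the ranker-specific 'eval_at' parameter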
    _base_doc = LGBMModel.fit.__doc__.replace("self : LGBMModel", "self : LGBMRanker")  # type: ignore
    fit.__doc__ = (_base_doc[:_base_doc.find('eval_class_weight :')]  # type: ignore
                   + _base_doc[_base_doc.find('eval_init_score :'):])  # type: ignore
    _base_doc = fit.__doc__
    _before_feature_name, _feature_name, _after_feature_name = _base_doc.partition('feature_name :')
    fit.__doc__ = f"""{_before_feature_name}eval_at : list or tuple of int, optional (default=(1, 2, 3, 4, 5))
        The evaluation positions of the specified metric.
    {_feature_name}{_after_feature_name}"""