engine.py 37.8 KB
Newer Older
wxchan's avatar
wxchan committed
1
# coding: utf-8
2
"""Library with training routines of LightGBM."""
3

4
import copy
5
import json
6
import warnings
7
from collections import OrderedDict, defaultdict
wxchan's avatar
wxchan committed
8
from operator import attrgetter
9
from pathlib import Path
10
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
11

wxchan's avatar
wxchan committed
12
import numpy as np
13

wxchan's avatar
wxchan committed
14
from . import callback
15
16
17
from .basic import (
    Booster,
    Dataset,
18
    LGBMDeprecationWarning,
19
20
21
22
23
24
25
26
27
28
29
30
    LightGBMError,
    _choose_param_value,
    _ConfigAliases,
    _InnerPredictor,
    _LGBM_BoosterEvalMethodResultType,
    _LGBM_BoosterEvalMethodResultWithStandardDeviationType,
    _LGBM_CategoricalFeatureConfiguration,
    _LGBM_CustomObjectiveFunction,
    _LGBM_EvalFunctionResultType,
    _LGBM_FeatureNameConfiguration,
    _log_warning,
)
31
from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold
wxchan's avatar
wxchan committed
32

33
# Public API of this module.
__all__ = [
    "cv",
    "CVBooster",
    "train",
]


40
41
42
43
44
45
46
# Type of a custom evaluation metric: a callable taking (preds, eval_data) and
# returning either a single eval-result tuple or a list of such tuples.
_LGBM_CustomMetricFunction = Union[
    Callable[
        [np.ndarray, Dataset],
        _LGBM_EvalFunctionResultType,
    ],
    Callable[
        [np.ndarray, Dataset],
        List[_LGBM_EvalFunctionResultType],
    ],
]
wxchan's avatar
wxchan committed
50

51
52
# Type of the ``fpreproc`` callback used by ``cv()``: receives the per-fold
# (train_set, test_set, params) triple and returns a transformed triple.
_LGBM_PreprocFunction = Callable[
    [Dataset, Dataset, Dict[str, Any]],
    Tuple[Dataset, Dataset, Dict[str, Any]],
]

56

57
58
59
60
61
62
63
64
65
def _emit_dataset_kwarg_warning(calling_function: str, argname: str) -> None:
    """Warn that a Dataset-related keyword argument of ``calling_function`` is deprecated.

    Parameters
    ----------
    calling_function : str
        Name of the public function (e.g. ``"train"``) that received the argument.
    argname : str
        Name of the deprecated keyword argument.
    """
    deprecation_message = (
        f"Argument '{argname}' to {calling_function}() is deprecated and will be removed in "
        f"a future release. Set '{argname}' when calling lightgbm.Dataset() instead. "
        "See https://github.com/microsoft/LightGBM/issues/6435."
    )
    # stacklevel=2 points the warning at the caller of train()/cv(), not at this helper
    warnings.warn(deprecation_message, category=LGBMDeprecationWarning, stacklevel=2)


66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def _choose_num_iterations(num_boost_round_kwarg: int, params: Dict[str, Any]) -> Dict[str, Any]:
    """Choose number of boosting rounds.

    In ``train()`` and ``cv()``, the number of boosting rounds can be configured in
    multiple ways:

      * the ``num_boost_round`` keyword argument
      * ``num_iterations`` or any of its aliases via the ``params`` dictionary

    Preference order (first one found wins):

      1. ``num_iterations`` provided via ``params`` (the main parameter name)
      2. any other alias of ``num_iterations`` provided via ``params``
      3. the ``num_boost_round`` keyword argument

    This function resolves that choice and warns when the result might be surprising.

    Returns
    -------
    params : dict
        Parameters, with ``"num_iterations"`` set to the preferred value and all other
        aliases of ``num_iterations`` removed.
    """
    aliases_in_params = {
        alias: params[alias]
        for alias in _ConfigAliases.get("num_iterations")
        if alias in params
    }

    # now that the relevant information has been pulled out of params, it's safe to
    # overwrite it with the content that should be used for training (aliases resolved)
    params = _choose_param_value(
        main_param_name="num_iterations",
        params=params,
        default_value=num_boost_round_kwarg,
    )

    # warn only when multiple aliases were supplied AND they disagree
    if len(aliases_in_params) > 1 and len(set(aliases_in_params.values())) > 1:
        value_string = ", ".join(f"{alias}={val}" for alias, val in aliases_in_params.items())
        _log_warning(
            f"Found conflicting values for num_iterations provided via 'params': {value_string}. "
            f"LightGBM will perform up to {params['num_iterations']} boosting rounds. "
            "To be confident in the maximum number of boosting rounds LightGBM will perform and to "
            "suppress this warning, modify 'params' so that only one of those is present."
        )
    return params


122
123
124
125
126
127
128
129
def train(
    params: Dict[str, Any],
    train_set: Dataset,
    num_boost_round: int = 100,
    valid_sets: Optional[List[Dataset]] = None,
    valid_names: Optional[List[str]] = None,
    feval: Optional[Union[_LGBM_CustomMetricFunction, List[_LGBM_CustomMetricFunction]]] = None,
    init_model: Optional[Union[str, Path, Booster]] = None,
    feature_name: _LGBM_FeatureNameConfiguration = "auto",
    categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto",
    keep_training_booster: bool = False,
    callbacks: Optional[List[Callable]] = None,
) -> Booster:
    """Perform the training with given parameters.

    Parameters
    ----------
    params : dict
        Parameters for training. Values passed through ``params`` take precedence over those
        supplied via arguments.
    train_set : Dataset
        Data to be trained on.
    num_boost_round : int, optional (default=100)
        Number of boosting iterations.
    valid_sets : list of Dataset, or None, optional (default=None)
        List of data to be evaluated on during training.
    valid_names : list of str, or None, optional (default=None)
        Names of ``valid_sets``.
    feval : callable, list of callable, or None, optional (default=None)
        Customized evaluation function.
        Each evaluation function should accept two parameters: preds, eval_data,
        and return (eval_name, eval_result, is_higher_better) or list of such tuples.

            preds : numpy 1-D array or numpy 2-D array (for multi-class task)
                The predicted values.
                For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes].
                If custom objective function is used, predicted values are returned before any transformation,
                e.g. they are raw margin instead of probability of positive class for binary task in this case.
            eval_data : Dataset
                A ``Dataset`` to evaluate.
            eval_name : str
                The name of evaluation function (without whitespaces).
            eval_result : float
                The eval result.
            is_higher_better : bool
                Is eval result higher better, e.g. AUC is ``is_higher_better``.

        To ignore the default metric corresponding to the used objective,
        set the ``metric`` parameter to the string ``"None"`` in ``params``.
    init_model : str, pathlib.Path, Booster or None, optional (default=None)
        Filename of LightGBM model or Booster instance used for continue training.
    feature_name : list of str, or 'auto', optional (default="auto")
        **Deprecated.** Set ``feature_name`` on ``train_set`` instead.
        Feature names.
        If 'auto' and data is pandas DataFrame, data columns names are used.
    categorical_feature : list of str or int, or 'auto', optional (default="auto")
        **Deprecated.** Set ``categorical_feature`` on ``train_set`` instead.
        Categorical features.
        If list of int, interpreted as indices.
        If list of str, interpreted as feature names (need to specify ``feature_name`` as well).
        If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
        All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647).
        Large values could be memory consuming. Consider using consecutive integers starting from zero.
        All negative values in categorical features will be treated as missing values.
        The output cannot be monotonically constrained with respect to a categorical feature.
        Floating point numbers in categorical features will be rounded towards 0.
    keep_training_booster : bool, optional (default=False)
        Whether the returned Booster will be used to keep training.
        If False, the returned value will be converted into _InnerPredictor before returning.
        This means you won't be able to use ``eval``, ``eval_train`` or ``eval_valid`` methods of the returned Booster.
        When your model is very large and cause the memory error,
        you can try to set this param to ``True`` to avoid the model conversion performed during the internal call of ``model_to_string``.
        You can still use _InnerPredictor as ``init_model`` for future continue training.
    callbacks : list of callable, or None, optional (default=None)
        List of callback functions that are applied at each iteration.
        See Callbacks in Python API for more information.

    Note
    ----
    A custom objective function can be provided for the ``objective`` parameter.
    It should accept two parameters: preds, train_data and return (grad, hess).

        preds : numpy 1-D array or numpy 2-D array (for multi-class task)
            The predicted values.
            Predicted values are returned before any transformation,
            e.g. they are raw margin instead of probability of positive class for binary task.
        train_data : Dataset
            The training dataset.
        grad : numpy 1-D array or numpy 2-D array (for multi-class task)
            The value of the first order derivative (gradient) of the loss
            with respect to the elements of preds for each sample point.
        hess : numpy 1-D array or numpy 2-D array (for multi-class task)
            The value of the second order derivative (Hessian) of the loss
            with respect to the elements of preds for each sample point.

    For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes],
    and grad and hess should be returned in the same format.

    Returns
    -------
    booster : Booster
        The trained Booster model.
    """
    # validate inputs before touching any state
    if not isinstance(train_set, Dataset):
        raise TypeError(f"train() only accepts Dataset object, train_set has type '{type(train_set).__name__}'.")

    if isinstance(valid_sets, list):
        for i, valid_item in enumerate(valid_sets):
            if not isinstance(valid_item, Dataset):
                raise TypeError(
                    "Every item in valid_sets must be a Dataset object. "
                    f"Item {i} has type '{type(valid_item).__name__}'."
                )

    # raise deprecation warnings if necessary
    # ref: https://github.com/microsoft/LightGBM/issues/6435
    if categorical_feature != "auto":
        _emit_dataset_kwarg_warning("train", "categorical_feature")
    if feature_name != "auto":
        _emit_dataset_kwarg_warning("train", "feature_name")

    # create predictor first
    # deepcopy so the caller's params dict is never mutated by alias resolution below
    params = copy.deepcopy(params)
    params = _choose_param_value(
        main_param_name="objective",
        params=params,
        default_value=None,
    )
    fobj: Optional[_LGBM_CustomObjectiveFunction] = None
    # a callable objective is a custom objective: keep the Python function as fobj
    # and set the params entry to "none" for the native library
    if callable(params["objective"]):
        fobj = params["objective"]
        params["objective"] = "none"

    # resolve num_boost_round vs. num_iterations aliases in params (params wins)
    params = _choose_num_iterations(num_boost_round_kwarg=num_boost_round, params=params)
    num_boost_round = params["num_iterations"]
    if num_boost_round <= 0:
        raise ValueError(f"Number of boosting rounds must be greater than 0. Got {num_boost_round}.")

    # setting early stopping via global params should be possible
    params = _choose_param_value(
        main_param_name="early_stopping_round",
        params=params,
        default_value=None,
    )
    if params["early_stopping_round"] is None:
        params.pop("early_stopping_round")
    first_metric_only = params.get("first_metric_only", False)

    # an init_model (file path or Booster) provides the starting point for continued training
    predictor: Optional[_InnerPredictor] = None
    if isinstance(init_model, (str, Path)):
        predictor = _InnerPredictor.from_model_file(model_file=init_model, pred_parameter=params)
    elif isinstance(init_model, Booster):
        predictor = _InnerPredictor.from_booster(booster=init_model, pred_parameter=dict(init_model.params, **params))

    if predictor is not None:
        init_iteration = predictor.current_iteration()
    else:
        init_iteration = 0

    # push resolved params / predictor / (deprecated) feature configuration into the Dataset
    train_set._update_params(params)._set_predictor(predictor).set_feature_name(feature_name).set_categorical_feature(
        categorical_feature
    )

    is_valid_contain_train = False
    train_data_name = "training"
    reduced_valid_sets = []
    name_valid_sets = []
    if valid_sets is not None:
        if isinstance(valid_sets, Dataset):
            valid_sets = [valid_sets]
        if isinstance(valid_names, str):
            valid_names = [valid_names]
        for i, valid_data in enumerate(valid_sets):
            # reduce cost for prediction training data
            if valid_data is train_set:
                is_valid_contain_train = True
                if valid_names is not None:
                    train_data_name = valid_names[i]
                continue
            reduced_valid_sets.append(valid_data._update_params(params).set_reference(train_set))
            if valid_names is not None and len(valid_names) > i:
                name_valid_sets.append(valid_names[i])
            else:
                name_valid_sets.append(f"valid_{i}")
    # process callbacks
    if callbacks is None:
        callbacks_set = set()
    else:
        for i, cb in enumerate(callbacks):
            # give each user callback a negative default 'order' so the user's list
            # order is preserved when sorted below
            cb.__dict__.setdefault("order", i - len(callbacks))
        callbacks_set = set(callbacks)

    # add an early-stopping callback automatically when early_stopping_round is configured
    if callback._should_enable_early_stopping(params.get("early_stopping_round", 0)):
        callbacks_set.add(
            callback.early_stopping(
                stopping_rounds=params["early_stopping_round"],  # type: ignore[arg-type]
                first_metric_only=first_metric_only,
                min_delta=params.get("early_stopping_min_delta", 0.0),
                verbose=_choose_param_value(
                    main_param_name="verbosity",
                    params=params,
                    default_value=1,
                ).pop("verbosity")
                > 0,
            )
        )

    # split callbacks into those run before vs. after each boosting iteration
    callbacks_before_iter_set = {cb for cb in callbacks_set if getattr(cb, "before_iteration", False)}
    callbacks_after_iter_set = callbacks_set - callbacks_before_iter_set
    callbacks_before_iter = sorted(callbacks_before_iter_set, key=attrgetter("order"))
    callbacks_after_iter = sorted(callbacks_after_iter_set, key=attrgetter("order"))

    # construct booster
    # the finally-block undoes the param injection on the Datasets even if construction fails
    try:
        booster = Booster(params=params, train_set=train_set)
        if is_valid_contain_train:
            booster.set_train_data_name(train_data_name)
        for valid_set, name_valid_set in zip(reduced_valid_sets, name_valid_sets):
            booster.add_valid(valid_set, name_valid_set)
    finally:
        train_set._reverse_update_params()
        for valid_set in reduced_valid_sets:
            valid_set._reverse_update_params()
    booster.best_iteration = 0

    # start training
    for i in range(init_iteration, init_iteration + num_boost_round):
        for cb in callbacks_before_iter:
            cb(
                callback.CallbackEnv(
                    model=booster,
                    params=params,
                    iteration=i,
                    begin_iteration=init_iteration,
                    end_iteration=init_iteration + num_boost_round,
                    evaluation_result_list=None,
                )
            )

        booster.update(fobj=fobj)

        evaluation_result_list: List[_LGBM_BoosterEvalMethodResultType] = []
        # check evaluation result.
        if valid_sets is not None:
            if is_valid_contain_train:
                evaluation_result_list.extend(booster.eval_train(feval))
            evaluation_result_list.extend(booster.eval_valid(feval))
        # after-iteration callbacks may signal early stopping via EarlyStopException
        try:
            for cb in callbacks_after_iter:
                cb(
                    callback.CallbackEnv(
                        model=booster,
                        params=params,
                        iteration=i,
                        begin_iteration=init_iteration,
                        end_iteration=init_iteration + num_boost_round,
                        evaluation_result_list=evaluation_result_list,
                    )
                )
        except callback.EarlyStopException as earlyStopException:
            booster.best_iteration = earlyStopException.best_iteration + 1
            evaluation_result_list = earlyStopException.best_score
            break
    # record the final (or best, under early stopping) scores per dataset and metric
    booster.best_score = defaultdict(OrderedDict)
    for dataset_name, eval_name, score, _ in evaluation_result_list:
        booster.best_score[dataset_name][eval_name] = score
    if not keep_training_booster:
        # round-trip through a model string and free the attached Datasets,
        # leaving a lightweight Booster usable for prediction
        booster.model_from_string(booster.model_to_string()).free_dataset()
    return booster


393
class CVBooster:
    """CVBooster in LightGBM.

    Auxiliary data structure to hold and redirect all boosters of ``cv()`` function.
    This class has the same methods as Booster class.
    All method calls, except for the following methods, are actually performed for underlying Boosters and
    then all returned results are returned in a list.

    - ``model_from_string()``
    - ``model_to_string()``
    - ``save_model()``

    Attributes
    ----------
    boosters : list of Booster
        The list of underlying fitted models.
    best_iteration : int
        The best iteration of fitted model.
    """

    def __init__(
        self,
        model_file: Optional[Union[str, Path]] = None,
    ):
        """Initialize the CVBooster.

        Parameters
        ----------
        model_file : str, pathlib.Path or None, optional (default=None)
            Path to the CVBooster model file.
        """
        self.boosters: List[Booster] = []
        self.best_iteration = -1

        # when a model file is given, hydrate this instance from its JSON contents
        if model_file is not None:
            with open(model_file, "r") as fh:
                self._from_dict(json.load(fh))

    def _from_dict(self, models: Dict[str, Any]) -> None:
        """Load CVBooster from dict."""
        self.best_iteration = models["best_iteration"]
        self.boosters = [Booster(model_str=one_model) for one_model in models["boosters"]]

    def _to_dict(self, num_iteration: Optional[int], start_iteration: int, importance_type: str) -> Dict[str, Any]:
        """Serialize CVBooster to dict."""
        serialized_boosters = [
            bst.model_to_string(
                num_iteration=num_iteration, start_iteration=start_iteration, importance_type=importance_type
            )
            for bst in self.boosters
        ]
        return {"boosters": serialized_boosters, "best_iteration": self.best_iteration}

    def __getattr__(self, name: str) -> Callable[[Any, Any], List[Any]]:
        """Redirect methods call of CVBooster."""

        def handler_function(*args: Any, **kwargs: Any) -> List[Any]:
            """Call methods with each booster, and concatenate their results."""
            return [getattr(bst, name)(*args, **kwargs) for bst in self.boosters]

        return handler_function

    def __getstate__(self) -> Dict[str, Any]:
        # plain attribute dict is sufficient for pickling
        return vars(self)

    def __setstate__(self, state: Dict[str, Any]) -> None:
        vars(self).update(state)

    def model_from_string(self, model_str: str) -> "CVBooster":
        """Load CVBooster from a string.

        Parameters
        ----------
        model_str : str
            Model will be loaded from this string.

        Returns
        -------
        self : CVBooster
            Loaded CVBooster object.
        """
        self._from_dict(json.loads(model_str))
        return self

    def model_to_string(
        self,
        num_iteration: Optional[int] = None,
        start_iteration: int = 0,
        importance_type: str = "split",
    ) -> str:
        """Save CVBooster to JSON string.

        Parameters
        ----------
        num_iteration : int or None, optional (default=None)
            Index of the iteration that should be saved.
            If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
            If <= 0, all iterations are saved.
        start_iteration : int, optional (default=0)
            Start index of the iteration that should be saved.
        importance_type : str, optional (default="split")
            What type of feature importance should be saved.
            If "split", result contains numbers of times the feature is used in a model.
            If "gain", result contains total gains of splits which use the feature.

        Returns
        -------
        str_repr : str
            JSON string representation of CVBooster.
        """
        return json.dumps(self._to_dict(num_iteration, start_iteration, importance_type))

    def save_model(
        self,
        filename: Union[str, Path],
        num_iteration: Optional[int] = None,
        start_iteration: int = 0,
        importance_type: str = "split",
    ) -> "CVBooster":
        """Save CVBooster to a file as JSON text.

        Parameters
        ----------
        filename : str or pathlib.Path
            Filename to save CVBooster.
        num_iteration : int or None, optional (default=None)
            Index of the iteration that should be saved.
            If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
            If <= 0, all iterations are saved.
        start_iteration : int, optional (default=0)
            Start index of the iteration that should be saved.
        importance_type : str, optional (default="split")
            What type of feature importance should be saved.
            If "split", result contains numbers of times the feature is used in a model.
            If "gain", result contains total gains of splits which use the feature.

        Returns
        -------
        self : CVBooster
            Returns self.
        """
        with open(filename, "w") as fh:
            json.dump(self._to_dict(num_iteration, start_iteration, importance_type), fh)

        return self

545

546
547
548
549
550
551
def _make_n_folds(
    full_data: Dataset,
    folds: Optional[Union[Iterable[Tuple[np.ndarray, np.ndarray]], _LGBMBaseCrossValidator]],
    nfold: int,
    params: Dict[str, Any],
    seed: int,
    fpreproc: Optional[_LGBM_PreprocFunction],
    stratified: bool,
    shuffle: bool,
    eval_train_metric: bool,
) -> CVBooster:
    """Make a n-fold list of Booster from random indices."""
    full_data = full_data.construct()
    num_data = full_data.num_data()

    if folds is not None:
        # user-supplied folds: either an iterable of (train_idx, test_idx) pairs
        # or a scikit-learn splitter exposing split()
        if not hasattr(folds, "__iter__") and not hasattr(folds, "split"):
            raise AttributeError(
                "folds should be a generator or iterator of (train_idx, test_idx) tuples "
                "or scikit-learn splitter object with split method"
            )
        if hasattr(folds, "split"):
            group_info = full_data.get_group()
            if group_info is None:
                flatted_group = np.zeros(num_data, dtype=np.int32)
            else:
                # expand per-group sizes into one group id per row
                group_info = np.asarray(group_info, dtype=np.int32)
                flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
            folds = folds.split(X=np.empty(num_data), y=full_data.get_label(), groups=flatted_group)
    else:
        ranking_objectives = {"lambdarank", "rank_xendcg", "xendcg", "xe_ndcg", "xe_ndcg_mart", "xendcg_mart"}
        is_ranking_task = any(
            params.get(obj_alias, "") in ranking_objectives for obj_alias in _ConfigAliases.get("objective")
        )
        if is_ranking_task:
            if not SKLEARN_INSTALLED:
                raise LightGBMError("scikit-learn is required for ranking cv")
            # ranking task, split according to groups
            group_info = np.asarray(full_data.get_group(), dtype=np.int32)
            flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
            folds = _LGBMGroupKFold(n_splits=nfold).split(X=np.empty(num_data), groups=flatted_group)
        elif stratified:
            if not SKLEARN_INSTALLED:
                raise LightGBMError("scikit-learn is required for stratified cv")
            splitter = _LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed)
            folds = splitter.split(X=np.empty(num_data), y=full_data.get_label())
        else:
            # plain k-fold on (optionally shuffled) row indices
            if shuffle:
                indices = np.random.RandomState(seed).permutation(num_data)
            else:
                indices = np.arange(num_data)
            fold_size = int(num_data / nfold)
            test_id = [indices[start : start + fold_size] for start in range(0, num_data, fold_size)]
            train_id = [np.concatenate([test_id[i] for i in range(nfold) if k != i]) for k in range(nfold)]
            folds = zip(train_id, test_id)

    ret = CVBooster()
    for train_idx, test_idx in folds:
        fold_train = full_data.subset(sorted(train_idx))
        fold_valid = full_data.subset(sorted(test_idx))
        # run preprocessing on the data set if needed
        if fpreproc is None:
            tparam = params
        else:
            fold_train, fold_valid, tparam = fpreproc(fold_train, fold_valid, params.copy())
        fold_booster = Booster(tparam, fold_train)
        if eval_train_metric:
            fold_booster.add_valid(fold_train, "train")
        fold_booster.add_valid(fold_valid, "valid")
        ret.boosters.append(fold_booster)
    return ret

wxchan's avatar
wxchan committed
618

619
def _agg_cv_result(
    raw_results: List[List[_LGBM_BoosterEvalMethodResultType]],
) -> List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType]:
    """Aggregate cross-validation results.

    Collects the per-fold scores for each (dataset, metric) pair and reduces them to
    mean and standard deviation, preserving first-seen metric order.
    """
    scores_by_metric: Dict[str, List[float]] = OrderedDict()
    higher_better_by_metric: Dict[str, bool] = {}
    for fold_result in raw_results:
        for dataset_name, metric_name, score, is_higher_better in fold_result:
            key = f"{dataset_name} {metric_name}"
            higher_better_by_metric[key] = is_higher_better
            scores_by_metric.setdefault(key, []).append(score)
    return [
        ("cv_agg", key, float(np.mean(scores)), higher_better_by_metric[key], float(np.std(scores)))
        for key, scores in scores_by_metric.items()
    ]
wxchan's avatar
wxchan committed
632

wxchan's avatar
wxchan committed
633

634
635
636
637
638
639
640
641
642
643
644
def cv(
    params: Dict[str, Any],
    train_set: Dataset,
    num_boost_round: int = 100,
    folds: Optional[Union[Iterable[Tuple[np.ndarray, np.ndarray]], _LGBMBaseCrossValidator]] = None,
    nfold: int = 5,
    stratified: bool = True,
    shuffle: bool = True,
    metrics: Optional[Union[str, List[str]]] = None,
    feval: Optional[Union[_LGBM_CustomMetricFunction, List[_LGBM_CustomMetricFunction]]] = None,
    init_model: Optional[Union[str, Path, Booster]] = None,
    feature_name: _LGBM_FeatureNameConfiguration = "auto",
    categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto",
    fpreproc: Optional[_LGBM_PreprocFunction] = None,
    seed: int = 0,
    callbacks: Optional[List[Callable]] = None,
    eval_train_metric: bool = False,
    return_cvbooster: bool = False,
) -> Dict[str, Union[List[float], CVBooster]]:
    """Perform the cross-validation with given parameters.

    Parameters
    ----------
    params : dict
        Parameters for training. Values passed through ``params`` take precedence over those
        supplied via arguments.
    train_set : Dataset
        Data to be trained on.
    num_boost_round : int, optional (default=100)
        Number of boosting iterations.
    folds : generator or iterator of (train_idx, test_idx) tuples, scikit-learn splitter object or None, optional (default=None)
        If generator or iterator, it should yield the train and test indices for each fold.
        If object, it should be one of the scikit-learn splitter classes
        (https://scikit-learn.org/stable/modules/classes.html#splitter-classes)
        and have ``split`` method.
        This argument has highest priority over other data split arguments.
    nfold : int, optional (default=5)
        Number of folds in CV.
    stratified : bool, optional (default=True)
        Whether to perform stratified sampling.
    shuffle : bool, optional (default=True)
        Whether to shuffle before splitting data.
    metrics : str, list of str, or None, optional (default=None)
        Evaluation metrics to be monitored while CV.
        If not None, the metric in ``params`` will be overridden.
    feval : callable, list of callable, or None, optional (default=None)
        Customized evaluation function.
        Each evaluation function should accept two parameters: preds, eval_data,
        and return (eval_name, eval_result, is_higher_better) or list of such tuples.

            preds : numpy 1-D array or numpy 2-D array (for multi-class task)
                The predicted values.
                For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes].
                If custom objective function is used, predicted values are returned before any transformation,
                e.g. they are raw margin instead of probability of positive class for binary task in this case.
            eval_data : Dataset
                A ``Dataset`` to evaluate.
            eval_name : str
                The name of evaluation function (without whitespace).
            eval_result : float
                The eval result.
            is_higher_better : bool
                Is eval result higher better, e.g. AUC is ``is_higher_better``.

        To ignore the default metric corresponding to the used objective,
        set ``metrics`` to the string ``"None"``.
    init_model : str, pathlib.Path, Booster or None, optional (default=None)
        Filename of LightGBM model or Booster instance used for continue training.
    feature_name : list of str, or 'auto', optional (default="auto")
        **Deprecated.** Set ``feature_name`` on ``train_set`` instead.
        Feature names.
        If 'auto' and data is pandas DataFrame, data columns names are used.
    categorical_feature : list of str or int, or 'auto', optional (default="auto")
        **Deprecated.** Set ``categorical_feature`` on ``train_set`` instead.
        Categorical features.
        If list of int, interpreted as indices.
        If list of str, interpreted as feature names (need to specify ``feature_name`` as well).
        If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
        All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647).
        Large values could be memory consuming. Consider using consecutive integers starting from zero.
        All negative values in categorical features will be treated as missing values.
        The output cannot be monotonically constrained with respect to a categorical feature.
        Floating point numbers in categorical features will be rounded towards 0.
    fpreproc : callable or None, optional (default=None)
        Preprocessing function that takes (dtrain, dtest, params)
        and returns transformed versions of those.
    seed : int, optional (default=0)
        Seed used to generate the folds (passed to numpy.random.seed).
    callbacks : list of callable, or None, optional (default=None)
        List of callback functions that are applied at each iteration.
        See Callbacks in Python API for more information.
    eval_train_metric : bool, optional (default=False)
        Whether to display the train metric in progress.
        The score of the metric is calculated again after each training step, so there is some impact on performance.
    return_cvbooster : bool, optional (default=False)
        Whether to return Booster models trained on each fold through ``CVBooster``.

    Note
    ----
    A custom objective function can be provided for the ``objective`` parameter.
    It should accept two parameters: preds, train_data and return (grad, hess).

        preds : numpy 1-D array or numpy 2-D array (for multi-class task)
            The predicted values.
            Predicted values are returned before any transformation,
            e.g. they are raw margin instead of probability of positive class for binary task.
        train_data : Dataset
            The training dataset.
        grad : numpy 1-D array or numpy 2-D array (for multi-class task)
            The value of the first order derivative (gradient) of the loss
            with respect to the elements of preds for each sample point.
        hess : numpy 1-D array or numpy 2-D array (for multi-class task)
            The value of the second order derivative (Hessian) of the loss
            with respect to the elements of preds for each sample point.

    For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes],
    and grad and hess should be returned in the same format.

    Returns
    -------
    eval_results : dict
        History of evaluation results of each metric.
        The dictionary has the following format:
        {'valid metric1-mean': [values], 'valid metric1-stdv': [values],
        'valid metric2-mean': [values], 'valid metric2-stdv': [values],
        ...}.
        If ``return_cvbooster=True``, also returns trained boosters wrapped in a ``CVBooster`` object via ``cvbooster`` key.
        If ``eval_train_metric=True``, also returns the train metric history.
        In this case, the dictionary has the following format:
        {'train metric1-mean': [values], 'valid metric1-mean': [values],
        'train metric2-mean': [values], 'valid metric2-mean': [values],
        ...}.
    """
    if not isinstance(train_set, Dataset):
        raise TypeError(f"cv() only accepts Dataset object, train_set has type '{type(train_set).__name__}'.")

    # raise deprecation warnings if necessary
    # ref: https://github.com/microsoft/LightGBM/issues/6435
    if categorical_feature != "auto":
        _emit_dataset_kwarg_warning("cv", "categorical_feature")
    if feature_name != "auto":
        _emit_dataset_kwarg_warning("cv", "feature_name")

    # Deep-copy so that alias resolution and popping below never mutate the caller's dict.
    params = copy.deepcopy(params)
    params = _choose_param_value(
        main_param_name="objective",
        params=params,
        default_value=None,
    )
    fobj: Optional[_LGBM_CustomObjectiveFunction] = None
    # A callable objective becomes a custom fobj; "none" tells the C++ side to skip
    # its built-in objective computation.
    if callable(params["objective"]):
        fobj = params["objective"]
        params["objective"] = "none"

    # num_boost_round in params (under any alias) wins over the keyword argument.
    params = _choose_num_iterations(num_boost_round_kwarg=num_boost_round, params=params)
    num_boost_round = params["num_iterations"]
    if num_boost_round <= 0:
        raise ValueError(f"Number of boosting rounds must be greater than 0. Got {num_boost_round}.")

    # setting early stopping via global params should be possible
    params = _choose_param_value(
        main_param_name="early_stopping_round",
        params=params,
        default_value=None,
    )
    if params["early_stopping_round"] is None:
        params.pop("early_stopping_round")
    first_metric_only = params.get("first_metric_only", False)

    # Build the predictor used for continued training, if an initial model was given.
    if isinstance(init_model, (str, Path)):
        predictor = _InnerPredictor.from_model_file(
            model_file=init_model,
            pred_parameter=params,
        )
    elif isinstance(init_model, Booster):
        predictor = _InnerPredictor.from_booster(
            booster=init_model,
            # explicit params override those stored on the Booster
            pred_parameter=dict(init_model.params, **params),
        )
    else:
        predictor = None

    if metrics is not None:
        # drop every alias of "metric" so the explicit `metrics` argument wins
        for metric_alias in _ConfigAliases.get("metric"):
            params.pop(metric_alias, None)
        params["metric"] = metrics

    train_set._update_params(params)._set_predictor(predictor).set_feature_name(feature_name).set_categorical_feature(
        categorical_feature
    )

    results = defaultdict(list)
    cvfolds = _make_n_folds(
        full_data=train_set,
        folds=folds,
        nfold=nfold,
        params=params,
        seed=seed,
        fpreproc=fpreproc,
        stratified=stratified,
        shuffle=shuffle,
        eval_train_metric=eval_train_metric,
    )

    # setup callbacks
    if callbacks is None:
        callbacks_set = set()
    else:
        for i, cb in enumerate(callbacks):
            # negative default order keeps user callbacks before internally-added ones,
            # preserving the order in which the user passed them
            cb.__dict__.setdefault("order", i - len(callbacks))
        callbacks_set = set(callbacks)

    if callback._should_enable_early_stopping(params.get("early_stopping_round", 0)):
        callbacks_set.add(
            callback.early_stopping(
                stopping_rounds=params["early_stopping_round"],  # type: ignore[arg-type]
                first_metric_only=first_metric_only,
                min_delta=params.get("early_stopping_min_delta", 0.0),
                verbose=_choose_param_value(
                    main_param_name="verbosity",
                    params=params,
                    default_value=1,
                ).pop("verbosity")
                > 0,
            )
        )

    # Split callbacks into before-/after-iteration groups, each sorted by "order".
    callbacks_before_iter_set = {cb for cb in callbacks_set if getattr(cb, "before_iteration", False)}
    callbacks_after_iter_set = callbacks_set - callbacks_before_iter_set
    callbacks_before_iter = sorted(callbacks_before_iter_set, key=attrgetter("order"))
    callbacks_after_iter = sorted(callbacks_after_iter_set, key=attrgetter("order"))

    for i in range(num_boost_round):
        for cb in callbacks_before_iter:
            cb(
                callback.CallbackEnv(
                    model=cvfolds,
                    params=params,
                    iteration=i,
                    begin_iteration=0,
                    end_iteration=num_boost_round,
                    evaluation_result_list=None,
                )
            )
        # One boosting round on every fold, then aggregate eval results across folds.
        cvfolds.update(fobj=fobj)  # type: ignore[call-arg]
        res = _agg_cv_result(cvfolds.eval_valid(feval))  # type: ignore[call-arg]
        for _, key, mean, _, std in res:
            results[f"{key}-mean"].append(mean)
            results[f"{key}-stdv"].append(std)
        try:
            for cb in callbacks_after_iter:
                cb(
                    callback.CallbackEnv(
                        model=cvfolds,
                        params=params,
                        iteration=i,
                        begin_iteration=0,
                        end_iteration=num_boost_round,
                        evaluation_result_list=res,
                    )
                )
        except callback.EarlyStopException as earlyStopException:
            # +1 converts the 0-based iteration index into a 1-based iteration count
            cvfolds.best_iteration = earlyStopException.best_iteration + 1
            for bst in cvfolds.boosters:
                bst.best_iteration = cvfolds.best_iteration
            # truncate recorded history to the best iteration
            for k in results:
                results[k] = results[k][: cvfolds.best_iteration]
            break

    if return_cvbooster:
        results["cvbooster"] = cvfolds  # type: ignore[assignment]

    return dict(results)