"src/io/multi_val_sparse_bin.cpp" did not exist on "b857ee10cc9a913e6dedd15c2475765d1e923c7b"
engine.py 35.7 KB
Newer Older
wxchan's avatar
wxchan committed
1
# coding: utf-8
2
"""Library with training routines of LightGBM."""
3
import copy
4
import json
5
import warnings
6
from collections import OrderedDict, defaultdict
wxchan's avatar
wxchan committed
7
from operator import attrgetter
8
from pathlib import Path
9
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
10

wxchan's avatar
wxchan committed
11
import numpy as np
12

wxchan's avatar
wxchan committed
13
from . import callback
14
15
16
from .basic import (
    Booster,
    Dataset,
17
    LGBMDeprecationWarning,
18
19
20
21
22
23
24
25
26
27
28
29
    LightGBMError,
    _choose_param_value,
    _ConfigAliases,
    _InnerPredictor,
    _LGBM_BoosterEvalMethodResultType,
    _LGBM_BoosterEvalMethodResultWithStandardDeviationType,
    _LGBM_CategoricalFeatureConfiguration,
    _LGBM_CustomObjectiveFunction,
    _LGBM_EvalFunctionResultType,
    _LGBM_FeatureNameConfiguration,
    _log_warning,
)
30
from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold
wxchan's avatar
wxchan committed
31

32
# Public API of this module: names exported by ``from lightgbm.engine import *``.
__all__ = [
    "cv",
    "CVBooster",
    "train",
]


39
40
41
42
43
44
45
# Accepted signatures for custom evaluation metrics (the ``feval`` argument of
# ``train()``/``cv()``): a callable taking (preds, eval_data) and returning either
# a single (eval_name, eval_result, is_higher_better) tuple or a list of them.
_LGBM_CustomMetricFunction = Union[
    Callable[
        [np.ndarray, Dataset],
        _LGBM_EvalFunctionResultType,
    ],
    Callable[
        [np.ndarray, Dataset],
        List[_LGBM_EvalFunctionResultType],
    ],
]
wxchan's avatar
wxchan committed
49

50
51
# Signature of the ``fpreproc`` hook used during cross-validation: receives the
# per-fold (train_set, valid_set, params) and returns the (possibly modified) triple.
_LGBM_PreprocFunction = Callable[
    [Dataset, Dataset, Dict[str, Any]],
    Tuple[Dataset, Dataset, Dict[str, Any]],
]

55

56
57
58
59
60
61
62
63
64
def _emit_dataset_kwarg_warning(calling_function: str, argname: str) -> None:
    """Warn that passing ``argname`` to ``calling_function()`` is deprecated.

    Users should set the argument on ``lightgbm.Dataset()`` instead.
    ``stacklevel=2`` points the warning at the caller of the deprecated function.
    """
    warnings.warn(
        f"Argument '{argname}' to {calling_function}() is deprecated and will be removed in "
        f"a future release. Set '{argname}' when calling lightgbm.Dataset() instead. "
        "See https://github.com/microsoft/LightGBM/issues/6435.",
        category=LGBMDeprecationWarning,
        stacklevel=2,
    )


65
66
67
68
69
70
71
72
def train(
    params: Dict[str, Any],
    train_set: Dataset,
    num_boost_round: int = 100,
    valid_sets: Optional[List[Dataset]] = None,
    valid_names: Optional[List[str]] = None,
    feval: Optional[Union[_LGBM_CustomMetricFunction, List[_LGBM_CustomMetricFunction]]] = None,
    init_model: Optional[Union[str, Path, Booster]] = None,
    feature_name: _LGBM_FeatureNameConfiguration = "auto",
    categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto",
    keep_training_booster: bool = False,
    callbacks: Optional[List[Callable]] = None,
) -> Booster:
    """Perform the training with given parameters.

    Parameters
    ----------
    params : dict
        Parameters for training. Values passed through ``params`` take precedence over those
        supplied via arguments.
    train_set : Dataset
        Data to be trained on.
    num_boost_round : int, optional (default=100)
        Number of boosting iterations.
    valid_sets : list of Dataset, or None, optional (default=None)
        List of data to be evaluated on during training.
    valid_names : list of str, or None, optional (default=None)
        Names of ``valid_sets``.
    feval : callable, list of callable, or None, optional (default=None)
        Customized evaluation function.
        Each evaluation function should accept two parameters: preds, eval_data,
        and return (eval_name, eval_result, is_higher_better) or list of such tuples.

            preds : numpy 1-D array or numpy 2-D array (for multi-class task)
                The predicted values.
                For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes].
                If custom objective function is used, predicted values are returned before any transformation,
                e.g. they are raw margin instead of probability of positive class for binary task in this case.
            eval_data : Dataset
                A ``Dataset`` to evaluate.
            eval_name : str
                The name of evaluation function (without whitespaces).
            eval_result : float
                The eval result.
            is_higher_better : bool
                Is eval result higher better, e.g. AUC is ``is_higher_better``.

        To ignore the default metric corresponding to the used objective,
        set the ``metric`` parameter to the string ``"None"`` in ``params``.
    init_model : str, pathlib.Path, Booster or None, optional (default=None)
        Filename of LightGBM model or Booster instance used for continue training.
    feature_name : list of str, or 'auto', optional (default="auto")
        **Deprecated.** Set ``feature_name`` on ``train_set`` instead.
        Feature names.
        If 'auto' and data is pandas DataFrame, data columns names are used.
    categorical_feature : list of str or int, or 'auto', optional (default="auto")
        **Deprecated.** Set ``categorical_feature`` on ``train_set`` instead.
        Categorical features.
        If list of int, interpreted as indices.
        If list of str, interpreted as feature names (need to specify ``feature_name`` as well).
        If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
        All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647).
        Large values could be memory consuming. Consider using consecutive integers starting from zero.
        All negative values in categorical features will be treated as missing values.
        The output cannot be monotonically constrained with respect to a categorical feature.
        Floating point numbers in categorical features will be rounded towards 0.
    keep_training_booster : bool, optional (default=False)
        Whether the returned Booster will be used to keep training.
        If False, the returned value will be converted into _InnerPredictor before returning.
        This means you won't be able to use ``eval``, ``eval_train`` or ``eval_valid`` methods of the returned Booster.
        When your model is very large and cause the memory error,
        you can try to set this param to ``True`` to avoid the model conversion performed during the internal call of ``model_to_string``.
        You can still use _InnerPredictor as ``init_model`` for future continue training.
    callbacks : list of callable, or None, optional (default=None)
        List of callback functions that are applied at each iteration.
        See Callbacks in Python API for more information.

    Note
    ----
    A custom objective function can be provided for the ``objective`` parameter.
    It should accept two parameters: preds, train_data and return (grad, hess).

        preds : numpy 1-D array or numpy 2-D array (for multi-class task)
            The predicted values.
            Predicted values are returned before any transformation,
            e.g. they are raw margin instead of probability of positive class for binary task.
        train_data : Dataset
            The training dataset.
        grad : numpy 1-D array or numpy 2-D array (for multi-class task)
            The value of the first order derivative (gradient) of the loss
            with respect to the elements of preds for each sample point.
        hess : numpy 1-D array or numpy 2-D array (for multi-class task)
            The value of the second order derivative (Hessian) of the loss
            with respect to the elements of preds for each sample point.

    For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes],
    and grad and hess should be returned in the same format.

    Returns
    -------
    booster : Booster
        The trained Booster model.
    """
    # --- argument validation ---
    if not isinstance(train_set, Dataset):
        raise TypeError(f"train() only accepts Dataset object, train_set has type '{type(train_set).__name__}'.")

    if num_boost_round <= 0:
        raise ValueError(f"num_boost_round must be greater than 0. Got {num_boost_round}.")

    if isinstance(valid_sets, list):
        for i, valid_item in enumerate(valid_sets):
            if not isinstance(valid_item, Dataset):
                raise TypeError(
                    "Every item in valid_sets must be a Dataset object. "
                    f"Item {i} has type '{type(valid_item).__name__}'."
                )

    # raise deprecation warnings if necessary
    # ref: https://github.com/microsoft/LightGBM/issues/6435
    if categorical_feature != "auto":
        _emit_dataset_kwarg_warning("train", "categorical_feature")
    if feature_name != "auto":
        _emit_dataset_kwarg_warning("train", "feature_name")

    # create predictor first
    # deep copy so that alias resolution / popped keys below never mutate the caller's dict
    params = copy.deepcopy(params)
    params = _choose_param_value(
        main_param_name="objective",
        params=params,
        default_value=None,
    )
    fobj: Optional[_LGBM_CustomObjectiveFunction] = None
    if callable(params["objective"]):
        # a callable objective is handled on the Python side; tell the C++ core to do nothing
        fobj = params["objective"]
        params["objective"] = "none"
    # params take precedence over the num_boost_round argument, under any alias
    for alias in _ConfigAliases.get("num_iterations"):
        if alias in params:
            num_boost_round = params.pop(alias)
            _log_warning(f"Found `{alias}` in params. Will use it instead of argument")
    params["num_iterations"] = num_boost_round
    # setting early stopping via global params should be possible
    params = _choose_param_value(
        main_param_name="early_stopping_round",
        params=params,
        default_value=None,
    )
    if params["early_stopping_round"] is None:
        params.pop("early_stopping_round")
    first_metric_only = params.get("first_metric_only", False)

    # an init model (continued training) is wrapped in an _InnerPredictor
    predictor: Optional[_InnerPredictor] = None
    if isinstance(init_model, (str, Path)):
        predictor = _InnerPredictor.from_model_file(model_file=init_model, pred_parameter=params)
    elif isinstance(init_model, Booster):
        predictor = _InnerPredictor.from_booster(booster=init_model, pred_parameter=dict(init_model.params, **params))

    if predictor is not None:
        init_iteration = predictor.current_iteration()
    else:
        init_iteration = 0

    train_set._update_params(params)._set_predictor(predictor).set_feature_name(feature_name).set_categorical_feature(
        categorical_feature
    )

    # collect validation sets, skipping any that are the training set itself
    is_valid_contain_train = False
    train_data_name = "training"
    reduced_valid_sets = []
    name_valid_sets = []
    if valid_sets is not None:
        if isinstance(valid_sets, Dataset):
            valid_sets = [valid_sets]
        if isinstance(valid_names, str):
            valid_names = [valid_names]
        for i, valid_data in enumerate(valid_sets):
            # reduce cost for prediction training data
            if valid_data is train_set:
                is_valid_contain_train = True
                if valid_names is not None:
                    train_data_name = valid_names[i]
                continue
            reduced_valid_sets.append(valid_data._update_params(params).set_reference(train_set))
            if valid_names is not None and len(valid_names) > i:
                name_valid_sets.append(valid_names[i])
            else:
                name_valid_sets.append(f"valid_{i}")
    # process callbacks
    if callbacks is None:
        callbacks_set = set()
    else:
        for i, cb in enumerate(callbacks):
            # negative "order" keeps user callbacks ahead of internally-added ones when sorted
            cb.__dict__.setdefault("order", i - len(callbacks))
        callbacks_set = set(callbacks)

    if callback._should_enable_early_stopping(params.get("early_stopping_round", 0)):
        callbacks_set.add(
            callback.early_stopping(
                stopping_rounds=params["early_stopping_round"],  # type: ignore[arg-type]
                first_metric_only=first_metric_only,
                min_delta=params.get("early_stopping_min_delta", 0.0),
                verbose=_choose_param_value(
                    main_param_name="verbosity",
                    params=params,
                    default_value=1,
                ).pop("verbosity")
                > 0,
            )
        )

    callbacks_before_iter_set = {cb for cb in callbacks_set if getattr(cb, "before_iteration", False)}
    callbacks_after_iter_set = callbacks_set - callbacks_before_iter_set
    callbacks_before_iter = sorted(callbacks_before_iter_set, key=attrgetter("order"))
    callbacks_after_iter = sorted(callbacks_after_iter_set, key=attrgetter("order"))

    # construct booster
    try:
        booster = Booster(params=params, train_set=train_set)
        if is_valid_contain_train:
            booster.set_train_data_name(train_data_name)
        for valid_set, name_valid_set in zip(reduced_valid_sets, name_valid_sets):
            booster.add_valid(valid_set, name_valid_set)
    finally:
        # always undo the temporary param overrides applied above, even if Booster() raised
        train_set._reverse_update_params()
        for valid_set in reduced_valid_sets:
            valid_set._reverse_update_params()
    booster.best_iteration = 0

    # start training
    for i in range(init_iteration, init_iteration + num_boost_round):
        for cb in callbacks_before_iter:
            cb(
                callback.CallbackEnv(
                    model=booster,
                    params=params,
                    iteration=i,
                    begin_iteration=init_iteration,
                    end_iteration=init_iteration + num_boost_round,
                    evaluation_result_list=None,
                )
            )

        booster.update(fobj=fobj)

        evaluation_result_list: List[_LGBM_BoosterEvalMethodResultType] = []
        # check evaluation result.
        if valid_sets is not None:
            if is_valid_contain_train:
                evaluation_result_list.extend(booster.eval_train(feval))
            evaluation_result_list.extend(booster.eval_valid(feval))
        try:
            for cb in callbacks_after_iter:
                cb(
                    callback.CallbackEnv(
                        model=booster,
                        params=params,
                        iteration=i,
                        begin_iteration=init_iteration,
                        end_iteration=init_iteration + num_boost_round,
                        evaluation_result_list=evaluation_result_list,
                    )
                )
        except callback.EarlyStopException as earlyStopException:
            # +1: best_iteration is 1-based for users, the loop index is 0-based
            booster.best_iteration = earlyStopException.best_iteration + 1
            evaluation_result_list = earlyStopException.best_score
            break
    booster.best_score = defaultdict(OrderedDict)
    for dataset_name, eval_name, score, _ in evaluation_result_list:
        booster.best_score[dataset_name][eval_name] = score
    if not keep_training_booster:
        # round-trip through the model string drops training-only state and frees the datasets
        booster.model_from_string(booster.model_to_string()).free_dataset()
    return booster


338
class CVBooster:
    """CVBooster in LightGBM.

    Auxiliary data structure to hold and redirect all boosters of ``cv()`` function.
    This class has the same methods as Booster class.
    All method calls, except for the following methods, are actually performed for underlying Boosters and
    then all returned results are returned in a list.

    - ``model_from_string()``
    - ``model_to_string()``
    - ``save_model()``

    Attributes
    ----------
    boosters : list of Booster
        The list of underlying fitted models.
    best_iteration : int
        The best iteration of fitted model.
    """

    def __init__(
        self,
        model_file: Optional[Union[str, Path]] = None,
    ):
        """Initialize the CVBooster.

        Parameters
        ----------
        model_file : str, pathlib.Path or None, optional (default=None)
            Path to the CVBooster model file.
        """
        # one Booster per cross-validation fold
        self.boosters: List[Booster] = []
        # -1 until set externally (e.g. by early stopping in cv())
        self.best_iteration = -1

        if model_file is not None:
            # model file is the JSON produced by save_model() below
            with open(model_file, "r") as file:
                self._from_dict(json.load(file))

    def _from_dict(self, models: Dict[str, Any]) -> None:
        """Load CVBooster from dict (inverse of ``_to_dict()``)."""
        self.best_iteration = models["best_iteration"]
        self.boosters = []
        for model_str in models["boosters"]:
            self.boosters.append(Booster(model_str=model_str))

    def _to_dict(self, num_iteration: Optional[int], start_iteration: int, importance_type: str) -> Dict[str, Any]:
        """Serialize CVBooster to dict."""
        models_str = []
        for booster in self.boosters:
            models_str.append(
                booster.model_to_string(
                    num_iteration=num_iteration, start_iteration=start_iteration, importance_type=importance_type
                )
            )
        return {"boosters": models_str, "best_iteration": self.best_iteration}

    def __getattr__(self, name: str) -> Callable[[Any, Any], List[Any]]:
        """Redirect methods call of CVBooster."""

        def handler_function(*args: Any, **kwargs: Any) -> List[Any]:
            """Call methods with each booster, and concatenate their results."""
            ret = []
            for booster in self.boosters:
                ret.append(getattr(booster, name)(*args, **kwargs))
            return ret

        return handler_function

    # __getstate__/__setstate__ are implemented explicitly so that pickling and
    # copying operate on the instance __dict__ rather than falling through to
    # the method-redirection in __getattr__ above.
    def __getstate__(self) -> Dict[str, Any]:
        return vars(self)

    def __setstate__(self, state: Dict[str, Any]) -> None:
        vars(self).update(state)

    def model_from_string(self, model_str: str) -> "CVBooster":
        """Load CVBooster from a string.

        Parameters
        ----------
        model_str : str
            Model will be loaded from this string.

        Returns
        -------
        self : CVBooster
            Loaded CVBooster object.
        """
        self._from_dict(json.loads(model_str))
        return self

    def model_to_string(
        self,
        num_iteration: Optional[int] = None,
        start_iteration: int = 0,
        importance_type: str = "split",
    ) -> str:
        """Save CVBooster to JSON string.

        Parameters
        ----------
        num_iteration : int or None, optional (default=None)
            Index of the iteration that should be saved.
            If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
            If <= 0, all iterations are saved.
        start_iteration : int, optional (default=0)
            Start index of the iteration that should be saved.
        importance_type : str, optional (default="split")
            What type of feature importance should be saved.
            If "split", result contains numbers of times the feature is used in a model.
            If "gain", result contains total gains of splits which use the feature.

        Returns
        -------
        str_repr : str
            JSON string representation of CVBooster.
        """
        return json.dumps(self._to_dict(num_iteration, start_iteration, importance_type))

    def save_model(
        self,
        filename: Union[str, Path],
        num_iteration: Optional[int] = None,
        start_iteration: int = 0,
        importance_type: str = "split",
    ) -> "CVBooster":
        """Save CVBooster to a file as JSON text.

        Parameters
        ----------
        filename : str or pathlib.Path
            Filename to save CVBooster.
        num_iteration : int or None, optional (default=None)
            Index of the iteration that should be saved.
            If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
            If <= 0, all iterations are saved.
        start_iteration : int, optional (default=0)
            Start index of the iteration that should be saved.
        importance_type : str, optional (default="split")
            What type of feature importance should be saved.
            If "split", result contains numbers of times the feature is used in a model.
            If "gain", result contains total gains of splits which use the feature.

        Returns
        -------
        self : CVBooster
            Returns self.
        """
        with open(filename, "w") as file:
            json.dump(self._to_dict(num_iteration, start_iteration, importance_type), file)

        return self

490

491
492
493
494
495
496
def _make_n_folds(
    full_data: Dataset,
    folds: Optional[Union[Iterable[Tuple[np.ndarray, np.ndarray]], _LGBMBaseCrossValidator]],
    nfold: int,
    params: Dict[str, Any],
    seed: int,
    fpreproc: Optional[_LGBM_PreprocFunction],
    stratified: bool,
    shuffle: bool,
    eval_train_metric: bool,
) -> CVBooster:
    """Make a n-fold list of Booster from random indices.

    Parameters
    ----------
    full_data : Dataset
        The full dataset to be split into folds.
    folds : generator/iterator of (train_idx, test_idx) tuples, scikit-learn splitter, or None
        Pre-defined fold assignment; if None, folds are generated from
        ``nfold``/``stratified``/``shuffle``/``seed`` (group-aware for ranking objectives).
    nfold : int
        Number of folds when ``folds`` is None.
    params : dict
        Training parameters (inspected for the objective; passed to each fold's Booster).
    seed : int
        Seed for the random permutation in the non-stratified split.
    fpreproc : callable or None
        Optional per-fold preprocessing hook ``(train_set, valid_set, params) -> same triple``.
    stratified : bool
        Whether to use stratified K-fold (requires scikit-learn).
    shuffle : bool
        Whether to shuffle rows before splitting.
    eval_train_metric : bool
        Whether each fold's Booster also evaluates on its training split.

    Returns
    -------
    ret : CVBooster
        CVBooster whose ``boosters`` hold one Booster per fold.
    """
    full_data = full_data.construct()
    num_data = full_data.num_data()
    if folds is not None:
        if not hasattr(folds, "__iter__") and not hasattr(folds, "split"):
            raise AttributeError(
                "folds should be a generator or iterator of (train_idx, test_idx) tuples "
                "or scikit-learn splitter object with split method"
            )
        if hasattr(folds, "split"):
            group_info = full_data.get_group()
            if group_info is not None:
                # np.asarray instead of np.array(..., copy=False): since NumPy 2.0,
                # copy=False raises ValueError when a copy is unavoidable, whereas
                # the intent here is "copy only if needed".
                group_info = np.asarray(group_info, dtype=np.int32)
                flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
            else:
                flatted_group = np.zeros(num_data, dtype=np.int32)
            folds = folds.split(X=np.empty(num_data), y=full_data.get_label(), groups=flatted_group)
    else:
        if any(
            params.get(obj_alias, "")
            in {"lambdarank", "rank_xendcg", "xendcg", "xe_ndcg", "xe_ndcg_mart", "xendcg_mart"}
            for obj_alias in _ConfigAliases.get("objective")
        ):
            if not SKLEARN_INSTALLED:
                raise LightGBMError("scikit-learn is required for ranking cv")
            # ranking task, split according to groups
            group_info = np.asarray(full_data.get_group(), dtype=np.int32)
            flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
            group_kfold = _LGBMGroupKFold(n_splits=nfold)
            folds = group_kfold.split(X=np.empty(num_data), groups=flatted_group)
        elif stratified:
            if not SKLEARN_INSTALLED:
                raise LightGBMError("scikit-learn is required for stratified cv")
            skf = _LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed)
            folds = skf.split(X=np.empty(num_data), y=full_data.get_label())
        else:
            # plain (non-stratified) split on a permutation of the row indices
            if shuffle:
                randidx = np.random.RandomState(seed).permutation(num_data)
            else:
                randidx = np.arange(num_data)
            kstep = int(num_data / nfold)
            test_id = [randidx[i : i + kstep] for i in range(0, num_data, kstep)]
            train_id = [np.concatenate([test_id[i] for i in range(nfold) if k != i]) for k in range(nfold)]
            folds = zip(train_id, test_id)

    ret = CVBooster()
    for train_idx, test_idx in folds:
        train_set = full_data.subset(sorted(train_idx))
        valid_set = full_data.subset(sorted(test_idx))
        # run preprocessing on the data set if needed
        if fpreproc is not None:
            train_set, valid_set, tparam = fpreproc(train_set, valid_set, params.copy())
        else:
            tparam = params
        booster_for_fold = Booster(tparam, train_set)
        if eval_train_metric:
            booster_for_fold.add_valid(train_set, "train")
        booster_for_fold.add_valid(valid_set, "valid")
        ret.boosters.append(booster_for_fold)
    return ret

wxchan's avatar
wxchan committed
563

564
def _agg_cv_result(
565
    raw_results: List[List[_LGBM_BoosterEvalMethodResultType]],
566
) -> List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType]:
567
    """Aggregate cross-validation results."""
568
    cvmap: Dict[str, List[float]] = OrderedDict()
569
    metric_type: Dict[str, bool] = {}
wxchan's avatar
wxchan committed
570
571
    for one_result in raw_results:
        for one_line in one_result:
572
            key = f"{one_line[0]} {one_line[1]}"
573
            metric_type[key] = one_line[3]
574
            cvmap.setdefault(key, [])
575
            cvmap[key].append(one_line[2])
576
    return [("cv_agg", k, float(np.mean(v)), metric_type[k], float(np.std(v))) for k, v in cvmap.items()]
wxchan's avatar
wxchan committed
577

wxchan's avatar
wxchan committed
578

579
580
581
582
583
584
585
586
587
588
589
def cv(
    params: Dict[str, Any],
    train_set: Dataset,
    num_boost_round: int = 100,
    folds: Optional[Union[Iterable[Tuple[np.ndarray, np.ndarray]], _LGBMBaseCrossValidator]] = None,
    nfold: int = 5,
    stratified: bool = True,
    shuffle: bool = True,
    metrics: Optional[Union[str, List[str]]] = None,
    feval: Optional[Union[_LGBM_CustomMetricFunction, List[_LGBM_CustomMetricFunction]]] = None,
    init_model: Optional[Union[str, Path, Booster]] = None,
    feature_name: _LGBM_FeatureNameConfiguration = "auto",
    categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto",
    fpreproc: Optional[_LGBM_PreprocFunction] = None,
    seed: int = 0,
    callbacks: Optional[List[Callable]] = None,
    eval_train_metric: bool = False,
    return_cvbooster: bool = False,
) -> Dict[str, Union[List[float], CVBooster]]:
    """Perform the cross-validation with given parameters.

    Parameters
    ----------
    params : dict
        Parameters for training. Values passed through ``params`` take precedence over those
        supplied via arguments.
    train_set : Dataset
        Data to be trained on.
    num_boost_round : int, optional (default=100)
        Number of boosting iterations.
    folds : generator or iterator of (train_idx, test_idx) tuples, scikit-learn splitter object or None, optional (default=None)
        If generator or iterator, it should yield the train and test indices for each fold.
        If object, it should be one of the scikit-learn splitter classes
        (https://scikit-learn.org/stable/modules/classes.html#splitter-classes)
        and have ``split`` method.
        This argument has highest priority over other data split arguments.
    nfold : int, optional (default=5)
        Number of folds in CV.
    stratified : bool, optional (default=True)
        Whether to perform stratified sampling.
    shuffle : bool, optional (default=True)
        Whether to shuffle before splitting data.
    metrics : str, list of str, or None, optional (default=None)
        Evaluation metrics to be monitored while CV.
        If not None, the metric in ``params`` will be overridden.
    feval : callable, list of callable, or None, optional (default=None)
        Customized evaluation function.
        Each evaluation function should accept two parameters: preds, eval_data,
        and return (eval_name, eval_result, is_higher_better) or list of such tuples.

            preds : numpy 1-D array or numpy 2-D array (for multi-class task)
                The predicted values.
                For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes].
                If custom objective function is used, predicted values are returned before any transformation,
                e.g. they are raw margin instead of probability of positive class for binary task in this case.
            eval_data : Dataset
                A ``Dataset`` to evaluate.
            eval_name : str
                The name of evaluation function (without whitespace).
            eval_result : float
                The eval result.
            is_higher_better : bool
                Is eval result higher better, e.g. AUC is ``is_higher_better``.

        To ignore the default metric corresponding to the used objective,
        set ``metrics`` to the string ``"None"``.
    init_model : str, pathlib.Path, Booster or None, optional (default=None)
        Filename of LightGBM model or Booster instance used for continue training.
    feature_name : list of str, or 'auto', optional (default="auto")
        **Deprecated.** Set ``feature_name`` on ``train_set`` instead.
        Feature names.
        If 'auto' and data is pandas DataFrame, data columns names are used.
    categorical_feature : list of str or int, or 'auto', optional (default="auto")
        **Deprecated.** Set ``categorical_feature`` on ``train_set`` instead.
        Categorical features.
        If list of int, interpreted as indices.
        If list of str, interpreted as feature names (need to specify ``feature_name`` as well).
        If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
        All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647).
        Large values could be memory consuming. Consider using consecutive integers starting from zero.
        All negative values in categorical features will be treated as missing values.
        The output cannot be monotonically constrained with respect to a categorical feature.
        Floating point numbers in categorical features will be rounded towards 0.
    fpreproc : callable or None, optional (default=None)
        Preprocessing function that takes (dtrain, dtest, params)
        and returns transformed versions of those.
    seed : int, optional (default=0)
        Seed used to generate the folds (passed to numpy.random.seed).
    callbacks : list of callable, or None, optional (default=None)
        List of callback functions that are applied at each iteration.
        See Callbacks in Python API for more information.
    eval_train_metric : bool, optional (default=False)
        Whether to display the train metric in progress.
        The score of the metric is calculated again after each training step, so there is some impact on performance.
    return_cvbooster : bool, optional (default=False)
        Whether to return Booster models trained on each fold through ``CVBooster``.

    Note
    ----
    A custom objective function can be provided for the ``objective`` parameter.
    It should accept two parameters: preds, train_data and return (grad, hess).

        preds : numpy 1-D array or numpy 2-D array (for multi-class task)
            The predicted values.
            Predicted values are returned before any transformation,
            e.g. they are raw margin instead of probability of positive class for binary task.
        train_data : Dataset
            The training dataset.
        grad : numpy 1-D array or numpy 2-D array (for multi-class task)
            The value of the first order derivative (gradient) of the loss
            with respect to the elements of preds for each sample point.
        hess : numpy 1-D array or numpy 2-D array (for multi-class task)
            The value of the second order derivative (Hessian) of the loss
            with respect to the elements of preds for each sample point.

    For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes],
    and grad and hess should be returned in the same format.

    Returns
    -------
    eval_results : dict
        History of evaluation results of each metric.
        The dictionary has the following format:
        {'valid metric1-mean': [values], 'valid metric1-stdv': [values],
        'valid metric2-mean': [values], 'valid metric2-stdv': [values],
        ...}.
        If ``return_cvbooster=True``, also returns trained boosters wrapped in a ``CVBooster`` object via ``cvbooster`` key.
        If ``eval_train_metric=True``, also returns the train metric history.
        In this case, the dictionary has the following format:
        {'train metric1-mean': [values], 'valid metric1-mean': [values],
        'train metric2-mean': [values], 'valid metric2-mean': [values],
        ...}.
    """
    if not isinstance(train_set, Dataset):
        raise TypeError(f"cv() only accepts Dataset object, train_set has type '{type(train_set).__name__}'.")

    if num_boost_round <= 0:
        raise ValueError(f"num_boost_round must be greater than 0. Got {num_boost_round}.")

    # raise deprecation warnings if necessary
    # ref: https://github.com/microsoft/LightGBM/issues/6435
    if categorical_feature != "auto":
        _emit_dataset_kwarg_warning("cv", "categorical_feature")
    if feature_name != "auto":
        _emit_dataset_kwarg_warning("cv", "feature_name")

    # Work on a private copy so the caller's params dict is never mutated.
    params = copy.deepcopy(params)
    params = _choose_param_value(
        main_param_name="objective",
        params=params,
        default_value=None,
    )
    fobj: Optional[_LGBM_CustomObjectiveFunction] = None
    if callable(params["objective"]):
        # A callable objective is pulled out as fobj; the built-in objective is
        # disabled ("none") so gradients come only from the custom function.
        fobj = params["objective"]
        params["objective"] = "none"
    # Aliases of num_iterations found in params override the function argument.
    for alias in _ConfigAliases.get("num_iterations"):
        if alias in params:
            _log_warning(f"Found '{alias}' in params. Will use it instead of 'num_boost_round' argument")
            num_boost_round = params.pop(alias)
    params["num_iterations"] = num_boost_round
    # setting early stopping via global params should be possible
    params = _choose_param_value(
        main_param_name="early_stopping_round",
        params=params,
        default_value=None,
    )
    if params["early_stopping_round"] is None:
        params.pop("early_stopping_round")
    first_metric_only = params.get("first_metric_only", False)

    # Resolve the continued-training predictor from a model file or a Booster.
    if isinstance(init_model, (str, Path)):
        predictor = _InnerPredictor.from_model_file(
            model_file=init_model,
            pred_parameter=params,
        )
    elif isinstance(init_model, Booster):
        predictor = _InnerPredictor.from_booster(
            booster=init_model,
            pred_parameter=dict(init_model.params, **params),
        )
    else:
        predictor = None

    if metrics is not None:
        # Explicit `metrics` argument replaces any metric aliases in params.
        for metric_alias in _ConfigAliases.get("metric"):
            params.pop(metric_alias, None)
        params["metric"] = metrics

    train_set._update_params(params)._set_predictor(predictor).set_feature_name(feature_name).set_categorical_feature(
        categorical_feature
    )

    results = defaultdict(list)
    cvfolds = _make_n_folds(
        full_data=train_set,
        folds=folds,
        nfold=nfold,
        params=params,
        seed=seed,
        fpreproc=fpreproc,
        stratified=stratified,
        shuffle=shuffle,
        eval_train_metric=eval_train_metric,
    )

    # setup callbacks
    if callbacks is None:
        callbacks_set = set()
    else:
        for i, cb in enumerate(callbacks):
            # Give user callbacks a negative default order so they run before
            # the internally added ones at each iteration.
            cb.__dict__.setdefault("order", i - len(callbacks))
        callbacks_set = set(callbacks)

    if callback._should_enable_early_stopping(params.get("early_stopping_round", 0)):
        callbacks_set.add(
            callback.early_stopping(
                stopping_rounds=params["early_stopping_round"],  # type: ignore[arg-type]
                first_metric_only=first_metric_only,
                min_delta=params.get("early_stopping_min_delta", 0.0),
                verbose=_choose_param_value(
                    main_param_name="verbosity",
                    params=params,
                    default_value=1,
                ).pop("verbosity")
                > 0,
            )
        )

    # Split callbacks by whether they run before or after each boosting step.
    callbacks_before_iter_set = {cb for cb in callbacks_set if getattr(cb, "before_iteration", False)}
    callbacks_after_iter_set = callbacks_set - callbacks_before_iter_set
    callbacks_before_iter = sorted(callbacks_before_iter_set, key=attrgetter("order"))
    callbacks_after_iter = sorted(callbacks_after_iter_set, key=attrgetter("order"))

    # Main boosting loop: one update per round on every fold, then aggregate.
    for i in range(num_boost_round):
        for cb in callbacks_before_iter:
            cb(
                callback.CallbackEnv(
                    model=cvfolds,
                    params=params,
                    iteration=i,
                    begin_iteration=0,
                    end_iteration=num_boost_round,
                    evaluation_result_list=None,
                )
            )
        cvfolds.update(fobj=fobj)  # type: ignore[call-arg]
        res = _agg_cv_result(cvfolds.eval_valid(feval))  # type: ignore[call-arg]
        for _, key, mean, _, std in res:
            results[f"{key}-mean"].append(mean)
            results[f"{key}-stdv"].append(std)
        try:
            for cb in callbacks_after_iter:
                cb(
                    callback.CallbackEnv(
                        model=cvfolds,
                        params=params,
                        iteration=i,
                        begin_iteration=0,
                        end_iteration=num_boost_round,
                        evaluation_result_list=res,
                    )
                )
        except callback.EarlyStopException as earlyStopException:
            # Early stopping: record best iteration on all fold boosters and
            # truncate the recorded history to that iteration.
            cvfolds.best_iteration = earlyStopException.best_iteration + 1
            for bst in cvfolds.boosters:
                bst.best_iteration = cvfolds.best_iteration
            for k in results:
                results[k] = results[k][: cvfolds.best_iteration]
            break

    if return_cvbooster:
        results["cvbooster"] = cvfolds  # type: ignore[assignment]

    return dict(results)