engine.py 34.2 KB
Newer Older
wxchan's avatar
wxchan committed
1
# coding: utf-8
2
"""Library with training routines of LightGBM."""
3
import copy
4
import json
5
from collections import OrderedDict, defaultdict
wxchan's avatar
wxchan committed
6
from operator import attrgetter
7
from pathlib import Path
8
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
9

wxchan's avatar
wxchan committed
10
import numpy as np
11

wxchan's avatar
wxchan committed
12
from . import callback
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
from .basic import (
    Booster,
    Dataset,
    LightGBMError,
    _choose_param_value,
    _ConfigAliases,
    _InnerPredictor,
    _LGBM_BoosterEvalMethodResultType,
    _LGBM_BoosterEvalMethodResultWithStandardDeviationType,
    _LGBM_CategoricalFeatureConfiguration,
    _LGBM_CustomObjectiveFunction,
    _LGBM_EvalFunctionResultType,
    _LGBM_FeatureNameConfiguration,
    _log_warning,
)
28
from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold
wxchan's avatar
wxchan committed
29

30
# Public API of this module.
__all__ = ["cv", "CVBooster", "train"]


37
38
39
40
41
42
43
# A custom eval metric: takes (preds, eval_dataset) and returns either a single
# (name, value, is_higher_better) tuple or a list of such tuples.
_LGBM_CustomMetricFunction = Union[
    Callable[[np.ndarray, Dataset], _LGBM_EvalFunctionResultType],
    Callable[[np.ndarray, Dataset], List[_LGBM_EvalFunctionResultType]],
]
wxchan's avatar
wxchan committed
47

48
49
# A per-fold preprocessing hook for cv(): takes (train_set, valid_set, params)
# and returns the (possibly modified) triple in the same order.
_LGBM_PreprocFunction = Callable[
    [Dataset, Dataset, Dict[str, Any]],
    Tuple[Dataset, Dataset, Dict[str, Any]],
]

53
54
55
56
57
58
59
60
61

def train(
    params: Dict[str, Any],
    train_set: Dataset,
    num_boost_round: int = 100,
    valid_sets: Optional[List[Dataset]] = None,
    valid_names: Optional[List[str]] = None,
    feval: Optional[Union[_LGBM_CustomMetricFunction, List[_LGBM_CustomMetricFunction]]] = None,
    init_model: Optional[Union[str, Path, Booster]] = None,
    feature_name: _LGBM_FeatureNameConfiguration = "auto",
    categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto",
    keep_training_booster: bool = False,
    callbacks: Optional[List[Callable]] = None,
) -> Booster:
    """Perform the training with given parameters.

    Parameters
    ----------
    params : dict
        Parameters for training. Values passed through ``params`` take precedence over those
        supplied via arguments.
    train_set : Dataset
        Data to be trained on.
    num_boost_round : int, optional (default=100)
        Number of boosting iterations.
    valid_sets : list of Dataset, or None, optional (default=None)
        List of data to be evaluated on during training.
    valid_names : list of str, or None, optional (default=None)
        Names of ``valid_sets``.
    feval : callable, list of callable, or None, optional (default=None)
        Customized evaluation function.
        Each evaluation function should accept two parameters: preds, eval_data,
        and return (eval_name, eval_result, is_higher_better) or list of such tuples.

            preds : numpy 1-D array or numpy 2-D array (for multi-class task)
                The predicted values.
                For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes].
                If custom objective function is used, predicted values are returned before any transformation,
                e.g. they are raw margin instead of probability of positive class for binary task in this case.
            eval_data : Dataset
                A ``Dataset`` to evaluate.
            eval_name : str
                The name of evaluation function (without whitespaces).
            eval_result : float
                The eval result.
            is_higher_better : bool
                Is eval result higher better, e.g. AUC is ``is_higher_better``.

        To ignore the default metric corresponding to the used objective,
        set the ``metric`` parameter to the string ``"None"`` in ``params``.
    init_model : str, pathlib.Path, Booster or None, optional (default=None)
        Filename of LightGBM model or Booster instance used for continue training.
    feature_name : list of str, or 'auto', optional (default="auto")
        Feature names.
        If 'auto' and data is pandas DataFrame, data columns names are used.
    categorical_feature : list of str or int, or 'auto', optional (default="auto")
        Categorical features.
        If list of int, interpreted as indices.
        If list of str, interpreted as feature names (need to specify ``feature_name`` as well).
        If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
        All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647).
        Large values could be memory consuming. Consider using consecutive integers starting from zero.
        All negative values in categorical features will be treated as missing values.
        The output cannot be monotonically constrained with respect to a categorical feature.
        Floating point numbers in categorical features will be rounded towards 0.
    keep_training_booster : bool, optional (default=False)
        Whether the returned Booster will be used to keep training.
        If False, the returned value will be converted into _InnerPredictor before returning.
        This means you won't be able to use ``eval``, ``eval_train`` or ``eval_valid`` methods of the returned Booster.
        When your model is very large and cause the memory error,
        you can try to set this param to ``True`` to avoid the model conversion performed during the internal call of ``model_to_string``.
        You can still use _InnerPredictor as ``init_model`` for future continue training.
    callbacks : list of callable, or None, optional (default=None)
        List of callback functions that are applied at each iteration.
        See Callbacks in Python API for more information.

    Note
    ----
    A custom objective function can be provided for the ``objective`` parameter.
    It should accept two parameters: preds, train_data and return (grad, hess).

        preds : numpy 1-D array or numpy 2-D array (for multi-class task)
            The predicted values.
            Predicted values are returned before any transformation,
            e.g. they are raw margin instead of probability of positive class for binary task.
        train_data : Dataset
            The training dataset.
        grad : numpy 1-D array or numpy 2-D array (for multi-class task)
            The value of the first order derivative (gradient) of the loss
            with respect to the elements of preds for each sample point.
        hess : numpy 1-D array or numpy 2-D array (for multi-class task)
            The value of the second order derivative (Hessian) of the loss
            with respect to the elements of preds for each sample point.

    For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes],
    and grad and hess should be returned in the same format.

    Returns
    -------
    booster : Booster
        The trained Booster model.
    """
    # ---- argument validation ------------------------------------------------
    if not isinstance(train_set, Dataset):
        raise TypeError(f"train() only accepts Dataset object, train_set has type '{type(train_set).__name__}'.")

    if num_boost_round <= 0:
        raise ValueError(f"num_boost_round must be greater than 0. Got {num_boost_round}.")

    if isinstance(valid_sets, list):
        for i, item in enumerate(valid_sets):
            if not isinstance(item, Dataset):
                raise TypeError(
                    "Every item in valid_sets must be a Dataset object. "
                    f"Item {i} has type '{type(item).__name__}'."
                )

    # ---- parameter normalization -------------------------------------------
    # Work on a private copy so the caller's dict is never mutated.
    params = copy.deepcopy(params)
    params = _choose_param_value(
        main_param_name="objective",
        params=params,
        default_value=None,
    )
    # A callable objective means custom training loss; the C++ side then runs
    # with objective "none" and gradients are supplied from Python.
    fobj: Optional[_LGBM_CustomObjectiveFunction] = None
    if callable(params["objective"]):
        fobj = params["objective"]
        params["objective"] = "none"
    # Any num_iterations alias present in params overrides the argument.
    for alias in _ConfigAliases.get("num_iterations"):
        if alias in params:
            num_boost_round = params.pop(alias)
            _log_warning(f"Found `{alias}` in params. Will use it instead of argument")
    params["num_iterations"] = num_boost_round
    # setting early stopping via global params should be possible
    params = _choose_param_value(
        main_param_name="early_stopping_round",
        params=params,
        default_value=None,
    )
    if params["early_stopping_round"] is None:
        params.pop("early_stopping_round")
    first_metric_only = params.get("first_metric_only", False)

    # ---- optional warm start ------------------------------------------------
    predictor: Optional[_InnerPredictor] = None
    if isinstance(init_model, (str, Path)):
        predictor = _InnerPredictor.from_model_file(model_file=init_model, pred_parameter=params)
    elif isinstance(init_model, Booster):
        predictor = _InnerPredictor.from_booster(booster=init_model, pred_parameter=dict(init_model.params, **params))

    init_iteration = predictor.current_iteration() if predictor is not None else 0

    train_set._update_params(params)._set_predictor(predictor).set_feature_name(feature_name).set_categorical_feature(
        categorical_feature
    )

    # ---- validation sets ----------------------------------------------------
    is_valid_contain_train = False
    train_data_name = "training"
    processed_valid_sets = []
    valid_set_names = []
    if valid_sets is not None:
        if isinstance(valid_sets, Dataset):
            valid_sets = [valid_sets]
        if isinstance(valid_names, str):
            valid_names = [valid_names]
        for i, valid_data in enumerate(valid_sets):
            # The training data is evaluated via eval_train(); skip re-adding it
            # as a validation set to avoid redundant prediction work.
            if valid_data is train_set:
                is_valid_contain_train = True
                if valid_names is not None:
                    train_data_name = valid_names[i]
                continue
            processed_valid_sets.append(valid_data._update_params(params).set_reference(train_set))
            if valid_names is not None and len(valid_names) > i:
                valid_set_names.append(valid_names[i])
            else:
                valid_set_names.append(f"valid_{i}")

    # ---- callback wiring ----------------------------------------------------
    if callbacks is None:
        all_callbacks = set()
    else:
        # Stamp a stable relative ordering onto user callbacks that lack one.
        for i, cb in enumerate(callbacks):
            cb.__dict__.setdefault("order", i - len(callbacks))
        all_callbacks = set(callbacks)

    if "early_stopping_round" in params:
        all_callbacks.add(
            callback.early_stopping(
                stopping_rounds=params["early_stopping_round"],  # type: ignore[arg-type]
                first_metric_only=first_metric_only,
                verbose=_choose_param_value(
                    main_param_name="verbosity",
                    params=params,
                    default_value=1,
                ).pop("verbosity")
                > 0,
            )
        )

    before_iter_cbs = sorted(
        (cb for cb in all_callbacks if getattr(cb, "before_iteration", False)),
        key=attrgetter("order"),
    )
    after_iter_cbs = sorted(all_callbacks.difference(before_iter_cbs), key=attrgetter("order"))

    # ---- booster construction -----------------------------------------------
    # The params pushed onto the Datasets above are temporary; always undo them,
    # even if Booster construction fails.
    try:
        booster = Booster(params=params, train_set=train_set)
        if is_valid_contain_train:
            booster.set_train_data_name(train_data_name)
        for valid_set, valid_set_name in zip(processed_valid_sets, valid_set_names):
            booster.add_valid(valid_set, valid_set_name)
    finally:
        train_set._reverse_update_params()
        for valid_set in processed_valid_sets:
            valid_set._reverse_update_params()
    booster.best_iteration = 0

    # ---- boosting loop ------------------------------------------------------
    for i in range(init_iteration, init_iteration + num_boost_round):
        for cb in before_iter_cbs:
            cb(
                callback.CallbackEnv(
                    model=booster,
                    params=params,
                    iteration=i,
                    begin_iteration=init_iteration,
                    end_iteration=init_iteration + num_boost_round,
                    evaluation_result_list=None,
                )
            )

        booster.update(fobj=fobj)

        evaluation_result_list: List[_LGBM_BoosterEvalMethodResultType] = []
        # check evaluation result.
        if valid_sets is not None:
            if is_valid_contain_train:
                evaluation_result_list.extend(booster.eval_train(feval))
            evaluation_result_list.extend(booster.eval_valid(feval))
        # After-iteration callbacks may signal early stopping via exception.
        try:
            for cb in after_iter_cbs:
                cb(
                    callback.CallbackEnv(
                        model=booster,
                        params=params,
                        iteration=i,
                        begin_iteration=init_iteration,
                        end_iteration=init_iteration + num_boost_round,
                        evaluation_result_list=evaluation_result_list,
                    )
                )
        except callback.EarlyStopException as early_stop:
            booster.best_iteration = early_stop.best_iteration + 1
            evaluation_result_list = early_stop.best_score
            break

    # ---- record best scores and finalize ------------------------------------
    booster.best_score = defaultdict(OrderedDict)
    for dataset_name, eval_name, score, _ in evaluation_result_list:
        booster.best_score[dataset_name][eval_name] = score
    if not keep_training_booster:
        # Serialize/deserialize to detach from the training Datasets and free them.
        booster.model_from_string(booster.model_to_string()).free_dataset()
    return booster


317
class CVBooster:
    """CVBooster in LightGBM.

    Auxiliary data structure to hold and redirect all boosters of ``cv()`` function.
    This class has the same methods as Booster class.
    All method calls, except for the following methods, are actually performed for underlying Boosters and
    then all returned results are returned in a list.

    - ``model_from_string()``
    - ``model_to_string()``
    - ``save_model()``

    Attributes
    ----------
    boosters : list of Booster
        The list of underlying fitted models.
    best_iteration : int
        The best iteration of fitted model.
    """

    def __init__(self, model_file: Optional[Union[str, Path]] = None):
        """Initialize the CVBooster.

        Parameters
        ----------
        model_file : str, pathlib.Path or None, optional (default=None)
            Path to the CVBooster model file.
        """
        # Per-fold boosters; filled either by cv() or by loading a model file.
        self.boosters: "List[Booster]" = []
        self.best_iteration = -1

        if model_file is not None:
            with open(model_file, "r") as fh:
                self._from_dict(json.load(fh))

    def _from_dict(self, models: Dict[str, Any]) -> None:
        """Load CVBooster from dict."""
        self.best_iteration = models["best_iteration"]
        self.boosters = [Booster(model_str=model_str) for model_str in models["boosters"]]

    def _to_dict(self, num_iteration: Optional[int], start_iteration: int, importance_type: str) -> Dict[str, Any]:
        """Serialize CVBooster to dict."""
        serialized = [
            fold_booster.model_to_string(
                num_iteration=num_iteration, start_iteration=start_iteration, importance_type=importance_type
            )
            for fold_booster in self.boosters
        ]
        return {"boosters": serialized, "best_iteration": self.best_iteration}

    def __getattr__(self, name: str) -> Callable[[Any, Any], List[Any]]:
        """Redirect methods call of CVBooster."""

        def handler_function(*args: Any, **kwargs: Any) -> List[Any]:
            """Call methods with each booster, and concatenate their results."""
            return [getattr(fold_booster, name)(*args, **kwargs) for fold_booster in self.boosters]

        return handler_function

    def __getstate__(self) -> Dict[str, Any]:
        # Explicit pickling support: __getattr__ would otherwise intercept these.
        return vars(self)

    def __setstate__(self, state: Dict[str, Any]) -> None:
        vars(self).update(state)

    def model_from_string(self, model_str: str) -> "CVBooster":
        """Load CVBooster from a string.

        Parameters
        ----------
        model_str : str
            Model will be loaded from this string.

        Returns
        -------
        self : CVBooster
            Loaded CVBooster object.
        """
        self._from_dict(json.loads(model_str))
        return self

    def model_to_string(
        self,
        num_iteration: Optional[int] = None,
        start_iteration: int = 0,
        importance_type: str = "split",
    ) -> str:
        """Save CVBooster to JSON string.

        Parameters
        ----------
        num_iteration : int or None, optional (default=None)
            Index of the iteration that should be saved.
            If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
            If <= 0, all iterations are saved.
        start_iteration : int, optional (default=0)
            Start index of the iteration that should be saved.
        importance_type : str, optional (default="split")
            What type of feature importance should be saved.
            If "split", result contains numbers of times the feature is used in a model.
            If "gain", result contains total gains of splits which use the feature.

        Returns
        -------
        str_repr : str
            JSON string representation of CVBooster.
        """
        return json.dumps(self._to_dict(num_iteration, start_iteration, importance_type))

    def save_model(
        self,
        filename: Union[str, Path],
        num_iteration: Optional[int] = None,
        start_iteration: int = 0,
        importance_type: str = "split",
    ) -> "CVBooster":
        """Save CVBooster to a file as JSON text.

        Parameters
        ----------
        filename : str or pathlib.Path
            Filename to save CVBooster.
        num_iteration : int or None, optional (default=None)
            Index of the iteration that should be saved.
            If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
            If <= 0, all iterations are saved.
        start_iteration : int, optional (default=0)
            Start index of the iteration that should be saved.
        importance_type : str, optional (default="split")
            What type of feature importance should be saved.
            If "split", result contains numbers of times the feature is used in a model.
            If "gain", result contains total gains of splits which use the feature.

        Returns
        -------
        self : CVBooster
            Returns self.
        """
        with open(filename, "w") as fh:
            json.dump(self._to_dict(num_iteration, start_iteration, importance_type), fh)

        return self

469

470
471
472
473
474
475
def _make_n_folds(
    full_data: Dataset,
    folds: Optional[Union[Iterable[Tuple[np.ndarray, np.ndarray]], _LGBMBaseCrossValidator]],
    nfold: int,
    params: Dict[str, Any],
    seed: int,
    fpreproc: Optional[_LGBM_PreprocFunction],
    stratified: bool,
    shuffle: bool,
    eval_train_metric: bool,
) -> CVBooster:
    """Make a n-fold list of Booster from random indices.

    Parameters
    ----------
    full_data : Dataset
        The full dataset to split into folds.
    folds : generator/iterator of (train_idx, test_idx) tuples, scikit-learn splitter, or None
        Pre-defined split; if None, a split is constructed from the other arguments.
    nfold : int
        Number of folds when ``folds`` is None.
    params : dict
        Training parameters (used to detect ranking objectives and passed to each Booster).
    seed : int
        Random seed used when ``shuffle`` is True.
    fpreproc : callable or None
        Optional per-fold preprocessing of (train_set, valid_set, params).
    stratified : bool
        Whether to use stratified splitting (requires scikit-learn).
    shuffle : bool
        Whether to shuffle rows before the plain k-fold split.
    eval_train_metric : bool
        Whether each fold Booster also evaluates on its training split.

    Returns
    -------
    ret : CVBooster
        CVBooster whose ``boosters`` hold one Booster per fold.
    """
    full_data = full_data.construct()
    num_data = full_data.num_data()
    if folds is not None:
        if not hasattr(folds, "__iter__") and not hasattr(folds, "split"):
            raise AttributeError(
                "folds should be a generator or iterator of (train_idx, test_idx) tuples "
                "or scikit-learn splitter object with split method"
            )
        if hasattr(folds, "split"):
            group_info = full_data.get_group()
            if group_info is not None:
                # np.asarray copies only when needed; np.array(..., copy=False)
                # raises ValueError under NumPy >= 2.0 when a copy is required.
                group_info = np.asarray(group_info, dtype=np.int32)
                flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
            else:
                flatted_group = np.zeros(num_data, dtype=np.int32)
            folds = folds.split(X=np.empty(num_data), y=full_data.get_label(), groups=flatted_group)
    else:
        if any(
            params.get(obj_alias, "")
            in {"lambdarank", "rank_xendcg", "xendcg", "xe_ndcg", "xe_ndcg_mart", "xendcg_mart"}
            for obj_alias in _ConfigAliases.get("objective")
        ):
            if not SKLEARN_INSTALLED:
                raise LightGBMError("scikit-learn is required for ranking cv")
            # ranking task, split according to groups
            # (np.asarray instead of np.array(..., copy=False) for NumPy 2.0 compatibility)
            group_info = np.asarray(full_data.get_group(), dtype=np.int32)
            flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
            group_kfold = _LGBMGroupKFold(n_splits=nfold)
            folds = group_kfold.split(X=np.empty(num_data), groups=flatted_group)
        elif stratified:
            if not SKLEARN_INSTALLED:
                raise LightGBMError("scikit-learn is required for stratified cv")
            skf = _LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed)
            folds = skf.split(X=np.empty(num_data), y=full_data.get_label())
        else:
            if shuffle:
                randidx = np.random.RandomState(seed).permutation(num_data)
            else:
                randidx = np.arange(num_data)
            kstep = int(num_data / nfold)
            test_id = [randidx[i : i + kstep] for i in range(0, num_data, kstep)]
            train_id = [np.concatenate([test_id[i] for i in range(nfold) if k != i]) for k in range(nfold)]
            folds = zip(train_id, test_id)

    ret = CVBooster()
    for train_idx, test_idx in folds:
        # Dataset.subset expects sorted row indices.
        train_set = full_data.subset(sorted(train_idx))
        valid_set = full_data.subset(sorted(test_idx))
        # run preprocessing on the data set if needed
        if fpreproc is not None:
            train_set, valid_set, tparam = fpreproc(train_set, valid_set, params.copy())
        else:
            tparam = params
        booster_for_fold = Booster(tparam, train_set)
        if eval_train_metric:
            booster_for_fold.add_valid(train_set, "train")
        booster_for_fold.add_valid(valid_set, "valid")
        ret.boosters.append(booster_for_fold)
    return ret

wxchan's avatar
wxchan committed
542

543
def _agg_cv_result(
544
    raw_results: List[List[_LGBM_BoosterEvalMethodResultType]],
545
) -> List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType]:
546
    """Aggregate cross-validation results."""
547
    cvmap: Dict[str, List[float]] = OrderedDict()
548
    metric_type: Dict[str, bool] = {}
wxchan's avatar
wxchan committed
549
550
    for one_result in raw_results:
        for one_line in one_result:
551
            key = f"{one_line[0]} {one_line[1]}"
552
            metric_type[key] = one_line[3]
553
            cvmap.setdefault(key, [])
554
            cvmap[key].append(one_line[2])
555
    return [("cv_agg", k, float(np.mean(v)), metric_type[k], float(np.std(v))) for k, v in cvmap.items()]
wxchan's avatar
wxchan committed
556

wxchan's avatar
wxchan committed
557

558
559
560
561
562
563
564
565
566
567
568
def cv(
    params: Dict[str, Any],
    train_set: Dataset,
    num_boost_round: int = 100,
    folds: Optional[Union[Iterable[Tuple[np.ndarray, np.ndarray]], _LGBMBaseCrossValidator]] = None,
    nfold: int = 5,
    stratified: bool = True,
    shuffle: bool = True,
    metrics: Optional[Union[str, List[str]]] = None,
    feval: Optional[Union[_LGBM_CustomMetricFunction, List[_LGBM_CustomMetricFunction]]] = None,
    init_model: Optional[Union[str, Path, Booster]] = None,
    feature_name: _LGBM_FeatureNameConfiguration = "auto",
    categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto",
    fpreproc: Optional[_LGBM_PreprocFunction] = None,
    seed: int = 0,
    callbacks: Optional[List[Callable]] = None,
    eval_train_metric: bool = False,
    return_cvbooster: bool = False,
) -> Dict[str, Union[List[float], CVBooster]]:
    """Perform the cross-validation with given parameters.

    Parameters
    ----------
    params : dict
        Parameters for training. Values passed through ``params`` take precedence over those
        supplied via arguments.
    train_set : Dataset
        Data to be trained on.
    num_boost_round : int, optional (default=100)
        Number of boosting iterations.
    folds : generator or iterator of (train_idx, test_idx) tuples, scikit-learn splitter object or None, optional (default=None)
        If generator or iterator, it should yield the train and test indices for each fold.
        If object, it should be one of the scikit-learn splitter classes
        (https://scikit-learn.org/stable/modules/classes.html#splitter-classes)
        and have ``split`` method.
        This argument has highest priority over other data split arguments.
    nfold : int, optional (default=5)
        Number of folds in CV.
    stratified : bool, optional (default=True)
        Whether to perform stratified sampling.
    shuffle : bool, optional (default=True)
        Whether to shuffle before splitting data.
    metrics : str, list of str, or None, optional (default=None)
        Evaluation metrics to be monitored while CV.
        If not None, the metric in ``params`` will be overridden.
    feval : callable, list of callable, or None, optional (default=None)
        Customized evaluation function.
        Each evaluation function should accept two parameters: preds, eval_data,
        and return (eval_name, eval_result, is_higher_better) or list of such tuples.

            preds : numpy 1-D array or numpy 2-D array (for multi-class task)
                The predicted values.
                For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes].
                If custom objective function is used, predicted values are returned before any transformation,
                e.g. they are raw margin instead of probability of positive class for binary task in this case.
            eval_data : Dataset
                A ``Dataset`` to evaluate.
            eval_name : str
                The name of evaluation function (without whitespace).
            eval_result : float
                The eval result.
            is_higher_better : bool
                Is eval result higher better, e.g. AUC is ``is_higher_better``.

        To ignore the default metric corresponding to the used objective,
        set ``metrics`` to the string ``"None"``.
    init_model : str, pathlib.Path, Booster or None, optional (default=None)
        Filename of LightGBM model or Booster instance used for continue training.
    feature_name : list of str, or 'auto', optional (default="auto")
        Feature names.
        If 'auto' and data is pandas DataFrame, data columns names are used.
    categorical_feature : list of str or int, or 'auto', optional (default="auto")
        Categorical features.
        If list of int, interpreted as indices.
        If list of str, interpreted as feature names (need to specify ``feature_name`` as well).
        If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
        All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647).
        Large values could be memory consuming. Consider using consecutive integers starting from zero.
        All negative values in categorical features will be treated as missing values.
        The output cannot be monotonically constrained with respect to a categorical feature.
        Floating point numbers in categorical features will be rounded towards 0.
    fpreproc : callable or None, optional (default=None)
        Preprocessing function that takes (dtrain, dtest, params)
        and returns transformed versions of those.
    seed : int, optional (default=0)
        Seed used to generate the folds (passed to numpy.random.seed).
    callbacks : list of callable, or None, optional (default=None)
        List of callback functions that are applied at each iteration.
        See Callbacks in Python API for more information.
    eval_train_metric : bool, optional (default=False)
        Whether to display the train metric in progress.
        The score of the metric is calculated again after each training step, so there is some impact on performance.
    return_cvbooster : bool, optional (default=False)
        Whether to return Booster models trained on each fold through ``CVBooster``.

    Note
    ----
    A custom objective function can be provided for the ``objective`` parameter.
    It should accept two parameters: preds, train_data and return (grad, hess).

        preds : numpy 1-D array or numpy 2-D array (for multi-class task)
            The predicted values.
            Predicted values are returned before any transformation,
            e.g. they are raw margin instead of probability of positive class for binary task.
        train_data : Dataset
            The training dataset.
        grad : numpy 1-D array or numpy 2-D array (for multi-class task)
            The value of the first order derivative (gradient) of the loss
            with respect to the elements of preds for each sample point.
        hess : numpy 1-D array or numpy 2-D array (for multi-class task)
            The value of the second order derivative (Hessian) of the loss
            with respect to the elements of preds for each sample point.

    For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes],
    and grad and hess should be returned in the same format.

    Returns
    -------
    eval_results : dict
        History of evaluation results of each metric.
        The dictionary has the following format:
        {'valid metric1-mean': [values], 'valid metric1-stdv': [values],
        'valid metric2-mean': [values], 'valid metric2-stdv': [values],
        ...}.
        If ``return_cvbooster=True``, also returns trained boosters wrapped in a ``CVBooster`` object via ``cvbooster`` key.
        If ``eval_train_metric=True``, also returns the train metric history.
        In this case, the dictionary has the following format:
        {'train metric1-mean': [values], 'valid metric1-mean': [values],
        'train metric2-mean': [values], 'valid metric2-mean': [values],
        ...}.
    """
    if not isinstance(train_set, Dataset):
        raise TypeError(f"cv() only accepts Dataset object, train_set has type '{type(train_set).__name__}'.")

    if num_boost_round <= 0:
        raise ValueError(f"num_boost_round must be greater than 0. Got {num_boost_round}.")

    # deep-copy so that the alias resolution / pops below never mutate the caller's dict
    params = copy.deepcopy(params)
    params = _choose_param_value(
        main_param_name="objective",
        params=params,
        default_value=None,
    )
    fobj: Optional[_LGBM_CustomObjectiveFunction] = None
    if callable(params["objective"]):
        # a callable objective is evaluated on the Python side each iteration;
        # "none" tells the backend not to apply a built-in objective
        fobj = params["objective"]
        params["objective"] = "none"
    # any num_iterations alias present in params wins over the num_boost_round argument
    for alias in _ConfigAliases.get("num_iterations"):
        if alias in params:
            _log_warning(f"Found '{alias}' in params. Will use it instead of 'num_boost_round' argument")
            num_boost_round = params.pop(alias)
    params["num_iterations"] = num_boost_round
    # setting early stopping via global params should be possible
    params = _choose_param_value(
        main_param_name="early_stopping_round",
        params=params,
        default_value=None,
    )
    if params["early_stopping_round"] is None:
        params.pop("early_stopping_round")
    first_metric_only = params.get("first_metric_only", False)

    # build the predictor used for continued training, if an initial model was given
    if isinstance(init_model, (str, Path)):
        predictor = _InnerPredictor.from_model_file(
            model_file=init_model,
            pred_parameter=params,
        )
    elif isinstance(init_model, Booster):
        predictor = _InnerPredictor.from_booster(
            booster=init_model,
            pred_parameter=dict(init_model.params, **params),
        )
    else:
        predictor = None

    # explicit `metrics` argument overrides every metric alias in params
    if metrics is not None:
        for metric_alias in _ConfigAliases.get("metric"):
            params.pop(metric_alias, None)
        params["metric"] = metrics

    train_set._update_params(params)._set_predictor(predictor).set_feature_name(feature_name).set_categorical_feature(
        categorical_feature
    )

    results = defaultdict(list)
    cvfolds = _make_n_folds(
        full_data=train_set,
        folds=folds,
        nfold=nfold,
        params=params,
        seed=seed,
        fpreproc=fpreproc,
        stratified=stratified,
        shuffle=shuffle,
        eval_train_metric=eval_train_metric,
    )

    # setup callbacks
    if callbacks is None:
        callbacks_set = set()
    else:
        for i, cb in enumerate(callbacks):
            # give each user callback a default (negative) order key so the
            # sorted() calls below keep the list order the user supplied
            cb.__dict__.setdefault("order", i - len(callbacks))
        callbacks_set = set(callbacks)

    if "early_stopping_round" in params:
        callbacks_set.add(
            callback.early_stopping(
                stopping_rounds=params["early_stopping_round"],  # type: ignore[arg-type]
                first_metric_only=first_metric_only,
                # verbose whenever the (possibly aliased) global verbosity is positive
                verbose=_choose_param_value(
                    main_param_name="verbosity",
                    params=params,
                    default_value=1,
                ).pop("verbosity")
                > 0,
            )
        )

    callbacks_before_iter_set = {cb for cb in callbacks_set if getattr(cb, "before_iteration", False)}
    callbacks_after_iter_set = callbacks_set - callbacks_before_iter_set
    callbacks_before_iter = sorted(callbacks_before_iter_set, key=attrgetter("order"))
    callbacks_after_iter = sorted(callbacks_after_iter_set, key=attrgetter("order"))

    for i in range(num_boost_round):
        for cb in callbacks_before_iter:
            cb(
                callback.CallbackEnv(
                    model=cvfolds,
                    params=params,
                    iteration=i,
                    begin_iteration=0,
                    end_iteration=num_boost_round,
                    evaluation_result_list=None,
                )
            )
        # advance every fold's booster by one iteration, then aggregate the
        # per-fold evaluation results into mean/stdv entries
        cvfolds.update(fobj=fobj)  # type: ignore[call-arg]
        res = _agg_cv_result(cvfolds.eval_valid(feval))  # type: ignore[call-arg]
        for _, key, mean, _, std in res:
            results[f"{key}-mean"].append(mean)
            results[f"{key}-stdv"].append(std)
        try:
            for cb in callbacks_after_iter:
                cb(
                    callback.CallbackEnv(
                        model=cvfolds,
                        params=params,
                        iteration=i,
                        begin_iteration=0,
                        end_iteration=num_boost_round,
                        evaluation_result_list=res,
                    )
                )
        except callback.EarlyStopException as earlyStopException:
            # record the best iteration on the CVBooster and each fold's booster,
            # and trim the recorded history back to that iteration
            cvfolds.best_iteration = earlyStopException.best_iteration + 1
            for bst in cvfolds.boosters:
                bst.best_iteration = cvfolds.best_iteration
            for k in results:
                results[k] = results[k][: cvfolds.best_iteration]
            break

    if return_cvbooster:
        results["cvbooster"] = cvfolds  # type: ignore[assignment]

    return dict(results)