engine.py 34.4 KB
Newer Older
wxchan's avatar
wxchan committed
1
# coding: utf-8
2
"""Library with training routines of LightGBM."""
3
import copy
4
import json
5
from collections import OrderedDict, defaultdict
wxchan's avatar
wxchan committed
6
from operator import attrgetter
7
from pathlib import Path
8
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
9

wxchan's avatar
wxchan committed
10
import numpy as np
11

wxchan's avatar
wxchan committed
12
from . import callback
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
from .basic import (
    Booster,
    Dataset,
    LightGBMError,
    _choose_param_value,
    _ConfigAliases,
    _InnerPredictor,
    _LGBM_BoosterEvalMethodResultType,
    _LGBM_BoosterEvalMethodResultWithStandardDeviationType,
    _LGBM_CategoricalFeatureConfiguration,
    _LGBM_CustomObjectiveFunction,
    _LGBM_EvalFunctionResultType,
    _LGBM_FeatureNameConfiguration,
    _log_warning,
)
28
from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold
wxchan's avatar
wxchan committed
29

30
# Public API of this module: the two training entry points and the
# cross-validation booster container.
__all__ = [
    "cv",
    "CVBooster",
    "train",
]


37
38
39
40
41
42
43
# Accepted signatures for custom evaluation metrics passed via ``feval``:
# (preds, eval_data) -> (eval_name, eval_result, is_higher_better),
# or the same callable returning a list of such tuples.
_LGBM_CustomMetricFunction = Union[
    Callable[
        [np.ndarray, Dataset],
        _LGBM_EvalFunctionResultType,
    ],
    Callable[
        [np.ndarray, Dataset],
        List[_LGBM_EvalFunctionResultType],
    ],
]
wxchan's avatar
wxchan committed
47

48
49
# Signature of the ``fpreproc`` callback used by ``cv()``: it receives each
# fold's (train_set, valid_set, params) and returns the (possibly modified)
# triple to be used for that fold.
_LGBM_PreprocFunction = Callable[
    [Dataset, Dataset, Dict[str, Any]],
    Tuple[Dataset, Dataset, Dict[str, Any]],
]

53
54
55
56
57
58
59
60
61

def train(
    params: Dict[str, Any],
    train_set: Dataset,
    num_boost_round: int = 100,
    valid_sets: Optional[List[Dataset]] = None,
    valid_names: Optional[List[str]] = None,
    feval: Optional[Union[_LGBM_CustomMetricFunction, List[_LGBM_CustomMetricFunction]]] = None,
    init_model: Optional[Union[str, Path, Booster]] = None,
    feature_name: _LGBM_FeatureNameConfiguration = "auto",
    categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto",
    keep_training_booster: bool = False,
    callbacks: Optional[List[Callable]] = None,
) -> Booster:
    """Perform the training with given parameters.

    Parameters
    ----------
    params : dict
        Parameters for training. Values passed through ``params`` take precedence over those
        supplied via arguments.
    train_set : Dataset
        Data to be trained on.
    num_boost_round : int, optional (default=100)
        Number of boosting iterations.
    valid_sets : list of Dataset, or None, optional (default=None)
        List of data to be evaluated on during training.
    valid_names : list of str, or None, optional (default=None)
        Names of ``valid_sets``.
    feval : callable, list of callable, or None, optional (default=None)
        Customized evaluation function.
        Each evaluation function should accept two parameters: preds, eval_data,
        and return (eval_name, eval_result, is_higher_better) or list of such tuples.

            preds : numpy 1-D array or numpy 2-D array (for multi-class task)
                The predicted values.
                For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes].
                If custom objective function is used, predicted values are returned before any transformation,
                e.g. they are raw margin instead of probability of positive class for binary task in this case.
            eval_data : Dataset
                A ``Dataset`` to evaluate.
            eval_name : str
                The name of evaluation function (without whitespaces).
            eval_result : float
                The eval result.
            is_higher_better : bool
                Is eval result higher better, e.g. AUC is ``is_higher_better``.

        To ignore the default metric corresponding to the used objective,
        set the ``metric`` parameter to the string ``"None"`` in ``params``.
    init_model : str, pathlib.Path, Booster or None, optional (default=None)
        Filename of LightGBM model or Booster instance used for continue training.
    feature_name : list of str, or 'auto', optional (default="auto")
        Feature names.
        If 'auto' and data is pandas DataFrame, data columns names are used.
    categorical_feature : list of str or int, or 'auto', optional (default="auto")
        Categorical features.
        If list of int, interpreted as indices.
        If list of str, interpreted as feature names (need to specify ``feature_name`` as well).
        If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
        All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647).
        Large values could be memory consuming. Consider using consecutive integers starting from zero.
        All negative values in categorical features will be treated as missing values.
        The output cannot be monotonically constrained with respect to a categorical feature.
        Floating point numbers in categorical features will be rounded towards 0.
    keep_training_booster : bool, optional (default=False)
        Whether the returned Booster will be used to keep training.
        If False, the returned value will be converted into _InnerPredictor before returning.
        This means you won't be able to use ``eval``, ``eval_train`` or ``eval_valid`` methods of the returned Booster.
        When your model is very large and cause the memory error,
        you can try to set this param to ``True`` to avoid the model conversion performed during the internal call of ``model_to_string``.
        You can still use _InnerPredictor as ``init_model`` for future continue training.
    callbacks : list of callable, or None, optional (default=None)
        List of callback functions that are applied at each iteration.
        See Callbacks in Python API for more information.

    Note
    ----
    A custom objective function can be provided for the ``objective`` parameter.
    It should accept two parameters: preds, train_data and return (grad, hess).

        preds : numpy 1-D array or numpy 2-D array (for multi-class task)
            The predicted values.
            Predicted values are returned before any transformation,
            e.g. they are raw margin instead of probability of positive class for binary task.
        train_data : Dataset
            The training dataset.
        grad : numpy 1-D array or numpy 2-D array (for multi-class task)
            The value of the first order derivative (gradient) of the loss
            with respect to the elements of preds for each sample point.
        hess : numpy 1-D array or numpy 2-D array (for multi-class task)
            The value of the second order derivative (Hessian) of the loss
            with respect to the elements of preds for each sample point.

    For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes],
    and grad and hess should be returned in the same format.

    Returns
    -------
    booster : Booster
        The trained Booster model.
    """
    if not isinstance(train_set, Dataset):
        raise TypeError(f"train() only accepts Dataset object, train_set has type '{type(train_set).__name__}'.")

    if num_boost_round <= 0:
        raise ValueError(f"num_boost_round must be greater than 0. Got {num_boost_round}.")

    if isinstance(valid_sets, list):
        for i, valid_item in enumerate(valid_sets):
            if not isinstance(valid_item, Dataset):
                raise TypeError(
                    "Every item in valid_sets must be a Dataset object. "
                    f"Item {i} has type '{type(valid_item).__name__}'."
                )

    # create predictor first
    # Deep-copy so the caller's ``params`` dict is never mutated by the
    # alias resolution / pops performed below.
    params = copy.deepcopy(params)
    params = _choose_param_value(
        main_param_name="objective",
        params=params,
        default_value=None,
    )
    fobj: Optional[_LGBM_CustomObjectiveFunction] = None
    # A callable objective is evaluated on the Python side; the C++ engine is
    # told to use "none" so it skips its built-in objective.
    if callable(params["objective"]):
        fobj = params["objective"]
        params["objective"] = "none"
    # Any num_iterations alias present in params overrides the
    # ``num_boost_round`` argument (params take precedence, per the docstring).
    for alias in _ConfigAliases.get("num_iterations"):
        if alias in params:
            num_boost_round = params.pop(alias)
            _log_warning(f"Found `{alias}` in params. Will use it instead of argument")
    params["num_iterations"] = num_boost_round
    # setting early stopping via global params should be possible
    params = _choose_param_value(
        main_param_name="early_stopping_round",
        params=params,
        default_value=None,
    )
    if params["early_stopping_round"] is None:
        params.pop("early_stopping_round")
    first_metric_only = params.get("first_metric_only", False)

    # Continued training: build an _InnerPredictor from a model file path or
    # from an existing in-memory Booster.
    predictor: Optional[_InnerPredictor] = None
    if isinstance(init_model, (str, Path)):
        predictor = _InnerPredictor.from_model_file(model_file=init_model, pred_parameter=params)
    elif isinstance(init_model, Booster):
        predictor = _InnerPredictor.from_booster(booster=init_model, pred_parameter=dict(init_model.params, **params))

    if predictor is not None:
        init_iteration = predictor.current_iteration()
    else:
        init_iteration = 0

    train_set._update_params(params)._set_predictor(predictor).set_feature_name(feature_name).set_categorical_feature(
        categorical_feature
    )

    is_valid_contain_train = False
    train_data_name = "training"
    reduced_valid_sets = []
    name_valid_sets = []
    if valid_sets is not None:
        if isinstance(valid_sets, Dataset):
            valid_sets = [valid_sets]
        if isinstance(valid_names, str):
            valid_names = [valid_names]
        for i, valid_data in enumerate(valid_sets):
            # reduce cost for prediction training data
            if valid_data is train_set:
                is_valid_contain_train = True
                if valid_names is not None:
                    train_data_name = valid_names[i]
                continue
            reduced_valid_sets.append(valid_data._update_params(params).set_reference(train_set))
            if valid_names is not None and len(valid_names) > i:
                name_valid_sets.append(valid_names[i])
            else:
                name_valid_sets.append(f"valid_{i}")
    # process callbacks
    if callbacks is None:
        callbacks_set = set()
    else:
        for i, cb in enumerate(callbacks):
            # Negative default "order" keeps user callbacks (in list order)
            # ahead of internally added ones when sorted below.
            cb.__dict__.setdefault("order", i - len(callbacks))
        callbacks_set = set(callbacks)

    if callback._should_enable_early_stopping(params.get("early_stopping_round", 0)):
        callbacks_set.add(
            callback.early_stopping(
                stopping_rounds=params["early_stopping_round"],  # type: ignore[arg-type]
                first_metric_only=first_metric_only,
                min_delta=params.get("early_stopping_min_delta", 0.0),
                verbose=_choose_param_value(
                    main_param_name="verbosity",
                    params=params,
                    default_value=1,
                ).pop("verbosity")
                > 0,
            )
        )

    callbacks_before_iter_set = {cb for cb in callbacks_set if getattr(cb, "before_iteration", False)}
    callbacks_after_iter_set = callbacks_set - callbacks_before_iter_set
    callbacks_before_iter = sorted(callbacks_before_iter_set, key=attrgetter("order"))
    callbacks_after_iter = sorted(callbacks_after_iter_set, key=attrgetter("order"))

    # construct booster
    try:
        booster = Booster(params=params, train_set=train_set)
        if is_valid_contain_train:
            booster.set_train_data_name(train_data_name)
        for valid_set, name_valid_set in zip(reduced_valid_sets, name_valid_sets):
            booster.add_valid(valid_set, name_valid_set)
    finally:
        # Restore parameters overridden by _update_params() even if Booster
        # construction or add_valid() fails.
        train_set._reverse_update_params()
        for valid_set in reduced_valid_sets:
            valid_set._reverse_update_params()
    booster.best_iteration = 0

    # start training
    for i in range(init_iteration, init_iteration + num_boost_round):
        for cb in callbacks_before_iter:
            cb(
                callback.CallbackEnv(
                    model=booster,
                    params=params,
                    iteration=i,
                    begin_iteration=init_iteration,
                    end_iteration=init_iteration + num_boost_round,
                    evaluation_result_list=None,
                )
            )

        booster.update(fobj=fobj)

        evaluation_result_list: List[_LGBM_BoosterEvalMethodResultType] = []
        # check evaluation result.
        if valid_sets is not None:
            if is_valid_contain_train:
                evaluation_result_list.extend(booster.eval_train(feval))
            evaluation_result_list.extend(booster.eval_valid(feval))
        try:
            for cb in callbacks_after_iter:
                cb(
                    callback.CallbackEnv(
                        model=booster,
                        params=params,
                        iteration=i,
                        begin_iteration=init_iteration,
                        end_iteration=init_iteration + num_boost_round,
                        evaluation_result_list=evaluation_result_list,
                    )
                )
        except callback.EarlyStopException as earlyStopException:
            # +1 converts the iteration index carried by the exception into the
            # 1-based best_iteration exposed on the Booster.
            booster.best_iteration = earlyStopException.best_iteration + 1
            evaluation_result_list = earlyStopException.best_score
            break
    booster.best_score = defaultdict(OrderedDict)
    for dataset_name, eval_name, score, _ in evaluation_result_list:
        booster.best_score[dataset_name][eval_name] = score
    if not keep_training_booster:
        # Round-trip through a model string and free the training data so the
        # returned Booster is lightweight (prediction-only).
        booster.model_from_string(booster.model_to_string()).free_dataset()
    return booster


318
class CVBooster:
    """CVBooster in LightGBM.

    Container that fans method calls out to all boosters produced by the ``cv()`` function.
    It exposes the same methods as the Booster class.
    Every method call, except for the ones listed below, is forwarded to each underlying
    Booster in turn and the per-fold results are collected into a list.

    - ``model_from_string()``
    - ``model_to_string()``
    - ``save_model()``

    Attributes
    ----------
    boosters : list of Booster
        The list of underlying fitted models.
    best_iteration : int
        The best iteration of fitted model.
    """

    def __init__(
        self,
        model_file: Optional[Union[str, Path]] = None,
    ):
        """Initialize the CVBooster.

        Parameters
        ----------
        model_file : str, pathlib.Path or None, optional (default=None)
            Path to the CVBooster model file.
        """
        # Start with no boosters; -1 marks "no best iteration recorded yet".
        self.boosters: List[Booster] = []
        self.best_iteration = -1

        if model_file is not None:
            with open(model_file, "r") as fh:
                self._from_dict(json.load(fh))

    def _from_dict(self, models: Dict[str, Any]) -> None:
        """Load CVBooster from dict."""
        self.best_iteration = models["best_iteration"]
        self.boosters = [Booster(model_str=serialized) for serialized in models["boosters"]]

    def _to_dict(self, num_iteration: Optional[int], start_iteration: int, importance_type: str) -> Dict[str, Any]:
        """Serialize CVBooster to dict."""
        serialized_boosters = [
            fold_booster.model_to_string(
                num_iteration=num_iteration, start_iteration=start_iteration, importance_type=importance_type
            )
            for fold_booster in self.boosters
        ]
        return {"boosters": serialized_boosters, "best_iteration": self.best_iteration}

    def __getattr__(self, name: str) -> Callable[[Any, Any], List[Any]]:
        """Redirect methods call of CVBooster."""

        def handler_function(*args: Any, **kwargs: Any) -> List[Any]:
            """Call the named method on every booster and collect the results in a list."""
            return [getattr(fold_booster, name)(*args, **kwargs) for fold_booster in self.boosters]

        return handler_function

    def __getstate__(self) -> Dict[str, Any]:
        # Explicit pickling hooks: without them, __getattr__ would intercept
        # the pickle protocol lookups on an instance under construction.
        return vars(self)

    def __setstate__(self, state: Dict[str, Any]) -> None:
        vars(self).update(state)

    def model_from_string(self, model_str: str) -> "CVBooster":
        """Load CVBooster from a string.

        Parameters
        ----------
        model_str : str
            Model will be loaded from this string.

        Returns
        -------
        self : CVBooster
            Loaded CVBooster object.
        """
        self._from_dict(json.loads(model_str))
        return self

    def model_to_string(
        self,
        num_iteration: Optional[int] = None,
        start_iteration: int = 0,
        importance_type: str = "split",
    ) -> str:
        """Save CVBooster to JSON string.

        Parameters
        ----------
        num_iteration : int or None, optional (default=None)
            Index of the iteration that should be saved.
            If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
            If <= 0, all iterations are saved.
        start_iteration : int, optional (default=0)
            Start index of the iteration that should be saved.
        importance_type : str, optional (default="split")
            What type of feature importance should be saved.
            If "split", result contains numbers of times the feature is used in a model.
            If "gain", result contains total gains of splits which use the feature.

        Returns
        -------
        str_repr : str
            JSON string representation of CVBooster.
        """
        return json.dumps(self._to_dict(num_iteration, start_iteration, importance_type))

    def save_model(
        self,
        filename: Union[str, Path],
        num_iteration: Optional[int] = None,
        start_iteration: int = 0,
        importance_type: str = "split",
    ) -> "CVBooster":
        """Save CVBooster to a file as JSON text.

        Parameters
        ----------
        filename : str or pathlib.Path
            Filename to save CVBooster.
        num_iteration : int or None, optional (default=None)
            Index of the iteration that should be saved.
            If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
            If <= 0, all iterations are saved.
        start_iteration : int, optional (default=0)
            Start index of the iteration that should be saved.
        importance_type : str, optional (default="split")
            What type of feature importance should be saved.
            If "split", result contains numbers of times the feature is used in a model.
            If "gain", result contains total gains of splits which use the feature.

        Returns
        -------
        self : CVBooster
            Returns self.
        """
        with open(filename, "w") as fh:
            json.dump(self._to_dict(num_iteration, start_iteration, importance_type), fh)

        return self

470

471
472
473
474
475
476
def _make_n_folds(
    full_data: Dataset,
    folds: Optional[Union[Iterable[Tuple[np.ndarray, np.ndarray]], _LGBMBaseCrossValidator]],
    nfold: int,
    params: Dict[str, Any],
    seed: int,
    fpreproc: Optional[_LGBM_PreprocFunction],
    stratified: bool,
    shuffle: bool,
    eval_train_metric: bool,
) -> CVBooster:
    """Make a n-fold list of Booster from random indices.

    Parameters
    ----------
    full_data : Dataset
        Full dataset to split into folds.
    folds : generator/iterator of (train_idx, test_idx) tuples, scikit-learn splitter, or None
        Pre-defined split. If None, the split is produced here from ``nfold``,
        ``stratified``, ``shuffle`` (and by group for ranking objectives).
    nfold : int
        Number of folds (used only when ``folds`` is None).
    params : dict
        Parameters passed to each per-fold Booster.
    seed : int
        Seed for the random split (used only when ``folds`` is None).
    fpreproc : callable or None
        Preprocessing function applied to each fold's (train_set, valid_set, params).
    stratified : bool
        Whether to perform stratified splitting (requires scikit-learn).
    shuffle : bool
        Whether to shuffle before splitting.
    eval_train_metric : bool
        Whether each fold's Booster also evaluates on its own training data.

    Returns
    -------
    ret : CVBooster
        CVBooster holding one Booster per fold.
    """
    full_data = full_data.construct()
    num_data = full_data.num_data()
    if folds is not None:
        if not hasattr(folds, "__iter__") and not hasattr(folds, "split"):
            raise AttributeError(
                "folds should be a generator or iterator of (train_idx, test_idx) tuples "
                "or scikit-learn splitter object with split method"
            )
        if hasattr(folds, "split"):
            group_info = full_data.get_group()
            if group_info is not None:
                # np.asarray() avoids a copy when dtype already matches; the previous
                # np.array(..., copy=False) raises ValueError under NumPy >= 2.0
                # whenever a copy would actually be required.
                group_info = np.asarray(group_info, dtype=np.int32)
                flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
            else:
                flatted_group = np.zeros(num_data, dtype=np.int32)
            folds = folds.split(X=np.empty(num_data), y=full_data.get_label(), groups=flatted_group)
    else:
        if any(
            params.get(obj_alias, "")
            in {"lambdarank", "rank_xendcg", "xendcg", "xe_ndcg", "xe_ndcg_mart", "xendcg_mart"}
            for obj_alias in _ConfigAliases.get("objective")
        ):
            if not SKLEARN_INSTALLED:
                raise LightGBMError("scikit-learn is required for ranking cv")
            # ranking task, split according to groups
            # np.asarray(): see note above about NumPy >= 2.0 copy semantics.
            group_info = np.asarray(full_data.get_group(), dtype=np.int32)
            flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
            group_kfold = _LGBMGroupKFold(n_splits=nfold)
            folds = group_kfold.split(X=np.empty(num_data), groups=flatted_group)
        elif stratified:
            if not SKLEARN_INSTALLED:
                raise LightGBMError("scikit-learn is required for stratified cv")
            skf = _LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed)
            folds = skf.split(X=np.empty(num_data), y=full_data.get_label())
        else:
            if shuffle:
                randidx = np.random.RandomState(seed).permutation(num_data)
            else:
                randidx = np.arange(num_data)
            kstep = int(num_data / nfold)
            test_id = [randidx[i : i + kstep] for i in range(0, num_data, kstep)]
            train_id = [np.concatenate([test_id[i] for i in range(nfold) if k != i]) for k in range(nfold)]
            folds = zip(train_id, test_id)

    ret = CVBooster()
    for train_idx, test_idx in folds:
        train_set = full_data.subset(sorted(train_idx))
        valid_set = full_data.subset(sorted(test_idx))
        # run preprocessing on the data set if needed
        if fpreproc is not None:
            train_set, valid_set, tparam = fpreproc(train_set, valid_set, params.copy())
        else:
            tparam = params
        booster_for_fold = Booster(tparam, train_set)
        if eval_train_metric:
            booster_for_fold.add_valid(train_set, "train")
        booster_for_fold.add_valid(valid_set, "valid")
        ret.boosters.append(booster_for_fold)
    return ret

wxchan's avatar
wxchan committed
543

544
def _agg_cv_result(
545
    raw_results: List[List[_LGBM_BoosterEvalMethodResultType]],
546
) -> List[_LGBM_BoosterEvalMethodResultWithStandardDeviationType]:
547
    """Aggregate cross-validation results."""
548
    cvmap: Dict[str, List[float]] = OrderedDict()
549
    metric_type: Dict[str, bool] = {}
wxchan's avatar
wxchan committed
550
551
    for one_result in raw_results:
        for one_line in one_result:
552
            key = f"{one_line[0]} {one_line[1]}"
553
            metric_type[key] = one_line[3]
554
            cvmap.setdefault(key, [])
555
            cvmap[key].append(one_line[2])
556
    return [("cv_agg", k, float(np.mean(v)), metric_type[k], float(np.std(v))) for k, v in cvmap.items()]
wxchan's avatar
wxchan committed
557

wxchan's avatar
wxchan committed
558

559
560
561
562
563
564
565
566
567
568
569
def cv(
    params: Dict[str, Any],
    train_set: Dataset,
    num_boost_round: int = 100,
    folds: Optional[Union[Iterable[Tuple[np.ndarray, np.ndarray]], _LGBMBaseCrossValidator]] = None,
    nfold: int = 5,
    stratified: bool = True,
    shuffle: bool = True,
    metrics: Optional[Union[str, List[str]]] = None,
    feval: Optional[Union[_LGBM_CustomMetricFunction, List[_LGBM_CustomMetricFunction]]] = None,
    init_model: Optional[Union[str, Path, Booster]] = None,
    feature_name: _LGBM_FeatureNameConfiguration = "auto",
    categorical_feature: _LGBM_CategoricalFeatureConfiguration = "auto",
    fpreproc: Optional[_LGBM_PreprocFunction] = None,
    seed: int = 0,
    callbacks: Optional[List[Callable]] = None,
    eval_train_metric: bool = False,
    return_cvbooster: bool = False,
) -> Dict[str, Union[List[float], CVBooster]]:
    """Perform the cross-validation with given parameters.

    Parameters
    ----------
    params : dict
        Parameters for training. Values passed through ``params`` take precedence over those
        supplied via arguments.
    train_set : Dataset
        Data to be trained on.
    num_boost_round : int, optional (default=100)
        Number of boosting iterations.
    folds : generator or iterator of (train_idx, test_idx) tuples, scikit-learn splitter object or None, optional (default=None)
        If generator or iterator, it should yield the train and test indices for each fold.
        If object, it should be one of the scikit-learn splitter classes
        (https://scikit-learn.org/stable/modules/classes.html#splitter-classes)
        and have ``split`` method.
        This argument has highest priority over other data split arguments.
    nfold : int, optional (default=5)
        Number of folds in CV.
    stratified : bool, optional (default=True)
        Whether to perform stratified sampling.
    shuffle : bool, optional (default=True)
        Whether to shuffle before splitting data.
    metrics : str, list of str, or None, optional (default=None)
        Evaluation metrics to be monitored while CV.
        If not None, the metric in ``params`` will be overridden.
    feval : callable, list of callable, or None, optional (default=None)
        Customized evaluation function.
        Each evaluation function should accept two parameters: preds, eval_data,
        and return (eval_name, eval_result, is_higher_better) or list of such tuples.

            preds : numpy 1-D array or numpy 2-D array (for multi-class task)
                The predicted values.
                For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes].
                If custom objective function is used, predicted values are returned before any transformation,
                e.g. they are raw margin instead of probability of positive class for binary task in this case.
            eval_data : Dataset
                A ``Dataset`` to evaluate.
            eval_name : str
                The name of evaluation function (without whitespace).
            eval_result : float
                The eval result.
            is_higher_better : bool
                Is eval result higher better, e.g. AUC is ``is_higher_better``.

        To ignore the default metric corresponding to the used objective,
        set ``metrics`` to the string ``"None"``.
    init_model : str, pathlib.Path, Booster or None, optional (default=None)
        Filename of LightGBM model or Booster instance used for continue training.
    feature_name : list of str, or 'auto', optional (default="auto")
        Feature names.
        If 'auto' and data is pandas DataFrame, data columns names are used.
    categorical_feature : list of str or int, or 'auto', optional (default="auto")
        Categorical features.
        If list of int, interpreted as indices.
        If list of str, interpreted as feature names (need to specify ``feature_name`` as well).
        If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
        All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647).
        Large values could be memory consuming. Consider using consecutive integers starting from zero.
        All negative values in categorical features will be treated as missing values.
        The output cannot be monotonically constrained with respect to a categorical feature.
        Floating point numbers in categorical features will be rounded towards 0.
    fpreproc : callable or None, optional (default=None)
        Preprocessing function that takes (dtrain, dtest, params)
        and returns transformed versions of those.
    seed : int, optional (default=0)
        Seed used to generate the folds (passed to numpy.random.seed).
    callbacks : list of callable, or None, optional (default=None)
        List of callback functions that are applied at each iteration.
        See Callbacks in Python API for more information.
    eval_train_metric : bool, optional (default=False)
        Whether to display the train metric in progress.
        The score of the metric is calculated again after each training step, so there is some impact on performance.
    return_cvbooster : bool, optional (default=False)
        Whether to return Booster models trained on each fold through ``CVBooster``.

    Note
    ----
    A custom objective function can be provided for the ``objective`` parameter.
    It should accept two parameters: preds, train_data and return (grad, hess).

        preds : numpy 1-D array or numpy 2-D array (for multi-class task)
            The predicted values.
            Predicted values are returned before any transformation,
            e.g. they are raw margin instead of probability of positive class for binary task.
        train_data : Dataset
            The training dataset.
        grad : numpy 1-D array or numpy 2-D array (for multi-class task)
            The value of the first order derivative (gradient) of the loss
            with respect to the elements of preds for each sample point.
        hess : numpy 1-D array or numpy 2-D array (for multi-class task)
            The value of the second order derivative (Hessian) of the loss
            with respect to the elements of preds for each sample point.

    For multi-class task, preds are numpy 2-D array of shape = [n_samples, n_classes],
    and grad and hess should be returned in the same format.

    Returns
    -------
    eval_results : dict
        History of evaluation results of each metric.
        The dictionary has the following format:
        {'valid metric1-mean': [values], 'valid metric1-stdv': [values],
        'valid metric2-mean': [values], 'valid metric2-stdv': [values],
        ...}.
        If ``return_cvbooster=True``, also returns trained boosters wrapped in a ``CVBooster`` object via ``cvbooster`` key.
        If ``eval_train_metric=True``, also returns the train metric history.
        In this case, the dictionary has the following format:
        {'train metric1-mean': [values], 'valid metric1-mean': [values],
        'train metric2-mean': [values], 'valid metric2-mean': [values],
        ...}.
    """
    if not isinstance(train_set, Dataset):
        raise TypeError(f"cv() only accepts Dataset object, train_set has type '{type(train_set).__name__}'.")

    if num_boost_round <= 0:
        raise ValueError(f"num_boost_round must be greater than 0. Got {num_boost_round}.")

    # Work on a deep copy so the caller's ``params`` dict is never mutated.
    params = copy.deepcopy(params)
    # Resolve the 'objective' aliases into a single canonical key.
    params = _choose_param_value(
        main_param_name="objective",
        params=params,
        default_value=None,
    )
    # A callable objective becomes the custom objective ``fobj``; the built-in
    # objective is then disabled by setting it to the string "none".
    fobj: Optional[_LGBM_CustomObjectiveFunction] = None
    if callable(params["objective"]):
        fobj = params["objective"]
        params["objective"] = "none"
    # Any 'num_iterations' alias found in params overrides the
    # ``num_boost_round`` argument (params take precedence, see docstring).
    for alias in _ConfigAliases.get("num_iterations"):
        if alias in params:
            _log_warning(f"Found '{alias}' in params. Will use it instead of 'num_boost_round' argument")
            num_boost_round = params.pop(alias)
    params["num_iterations"] = num_boost_round
    # setting early stopping via global params should be possible
    params = _choose_param_value(
        main_param_name="early_stopping_round",
        params=params,
        default_value=None,
    )
    # Drop the key entirely when unset so later ``get`` calls see the default.
    if params["early_stopping_round"] is None:
        params.pop("early_stopping_round")
    first_metric_only = params.get("first_metric_only", False)

    # Build a predictor for continued training from a model file or an
    # in-memory Booster; otherwise start from scratch.
    if isinstance(init_model, (str, Path)):
        predictor = _InnerPredictor.from_model_file(
            model_file=init_model,
            pred_parameter=params,
        )
    elif isinstance(init_model, Booster):
        predictor = _InnerPredictor.from_booster(
            booster=init_model,
            pred_parameter=dict(init_model.params, **params),
        )
    else:
        predictor = None

    # The ``metrics`` argument overrides any metric aliases already in params.
    if metrics is not None:
        for metric_alias in _ConfigAliases.get("metric"):
            params.pop(metric_alias, None)
        params["metric"] = metrics

    # Push params, predictor, and feature configuration into the Dataset
    # before it is split into folds.
    train_set._update_params(params)._set_predictor(predictor).set_feature_name(feature_name).set_categorical_feature(
        categorical_feature
    )

    results = defaultdict(list)
    cvfolds = _make_n_folds(
        full_data=train_set,
        folds=folds,
        nfold=nfold,
        params=params,
        seed=seed,
        fpreproc=fpreproc,
        stratified=stratified,
        shuffle=shuffle,
        eval_train_metric=eval_train_metric,
    )

    # setup callbacks
    if callbacks is None:
        callbacks_set = set()
    else:
        # Assign increasing negative 'order' values so user-supplied callbacks
        # keep their given relative order when sorted below.
        for i, cb in enumerate(callbacks):
            cb.__dict__.setdefault("order", i - len(callbacks))
        callbacks_set = set(callbacks)

    # Add the internal early-stopping callback when enabled via params;
    # its logging verbosity follows the resolved 'verbosity' parameter.
    if callback._should_enable_early_stopping(params.get("early_stopping_round", 0)):
        callbacks_set.add(
            callback.early_stopping(
                stopping_rounds=params["early_stopping_round"],  # type: ignore[arg-type]
                first_metric_only=first_metric_only,
                min_delta=params.get("early_stopping_min_delta", 0.0),
                verbose=_choose_param_value(
                    main_param_name="verbosity",
                    params=params,
                    default_value=1,
                ).pop("verbosity")
                > 0,
            )
        )

    # Split callbacks into before-/after-iteration groups, each sorted by 'order'.
    callbacks_before_iter_set = {cb for cb in callbacks_set if getattr(cb, "before_iteration", False)}
    callbacks_after_iter_set = callbacks_set - callbacks_before_iter_set
    callbacks_before_iter = sorted(callbacks_before_iter_set, key=attrgetter("order"))
    callbacks_after_iter = sorted(callbacks_after_iter_set, key=attrgetter("order"))

    # Main boosting loop: one update across all folds per iteration.
    for i in range(num_boost_round):
        for cb in callbacks_before_iter:
            cb(
                callback.CallbackEnv(
                    model=cvfolds,
                    params=params,
                    iteration=i,
                    begin_iteration=0,
                    end_iteration=num_boost_round,
                    evaluation_result_list=None,
                )
            )
        cvfolds.update(fobj=fobj)  # type: ignore[call-arg]
        # Aggregate the per-fold eval results into (mean, stdv) per metric.
        res = _agg_cv_result(cvfolds.eval_valid(feval))  # type: ignore[call-arg]
        for _, key, mean, _, std in res:
            results[f"{key}-mean"].append(mean)
            results[f"{key}-stdv"].append(std)
        try:
            for cb in callbacks_after_iter:
                cb(
                    callback.CallbackEnv(
                        model=cvfolds,
                        params=params,
                        iteration=i,
                        begin_iteration=0,
                        end_iteration=num_boost_round,
                        evaluation_result_list=res,
                    )
                )
        except callback.EarlyStopException as earlyStopException:
            # Early stop: record the best iteration on the CVBooster and each
            # fold's Booster, and truncate the recorded history to match.
            cvfolds.best_iteration = earlyStopException.best_iteration + 1
            for bst in cvfolds.boosters:
                bst.best_iteration = cvfolds.best_iteration
            for k in results:
                results[k] = results[k][: cvfolds.best_iteration]
            break

    if return_cvbooster:
        results["cvbooster"] = cvfolds  # type: ignore[assignment]

    return dict(results)