engine.py 29.8 KB
Newer Older
wxchan's avatar
wxchan committed
1
# coding: utf-8
2
"""Library with training routines of LightGBM."""
wxchan's avatar
wxchan committed
3
import collections
4
import copy
wxchan's avatar
wxchan committed
5
from operator import attrgetter
6

wxchan's avatar
wxchan committed
7
import numpy as np
8

wxchan's avatar
wxchan committed
9
from . import callback
10
from .basic import Booster, Dataset, LightGBMError, _ConfigAliases, _InnerPredictor, _log_warning
11
from .compat import SKLEARN_INSTALLED, _LGBMGroupKFold, _LGBMStratifiedKFold
wxchan's avatar
wxchan committed
12

wxchan's avatar
wxchan committed
13

Guolin Ke's avatar
Guolin Ke committed
14
15
def train(params, train_set, num_boost_round=100,
          valid_sets=None, valid_names=None,
          fobj=None, feval=None, init_model=None,
          feature_name='auto', categorical_feature='auto',
          early_stopping_rounds=None, evals_result=None,
          verbose_eval=True, learning_rates=None,
          keep_training_booster=False, callbacks=None):
    """Perform the training with given parameters.

    Parameters
    ----------
    params : dict
        Parameters for training.
    train_set : Dataset
        Data to be trained on.
    num_boost_round : int, optional (default=100)
        Number of boosting iterations.
    valid_sets : list of Datasets or None, optional (default=None)
        List of data to be evaluated on during training.
    valid_names : list of strings or None, optional (default=None)
        Names of ``valid_sets``.
    fobj : callable or None, optional (default=None)
        Customized objective function.
        Should accept two parameters: preds, train_data,
        and return (grad, hess).

            preds : list or numpy 1-D array
                The predicted values.
                Predicted values are returned before any transformation,
                e.g. they are raw margin instead of probability of positive class for binary task.
            train_data : Dataset
                The training dataset.
            grad : list or numpy 1-D array
                The value of the first order derivative (gradient) of the loss
                with respect to the elements of preds for each sample point.
            hess : list or numpy 1-D array
                The value of the second order derivative (Hessian) of the loss
                with respect to the elements of preds for each sample point.

        For multi-class task, the preds is group by class_id first, then group by row_id.
        If you want to get i-th row preds in j-th class, the access way is score[j * num_data + i]
        and you should group grad and hess in this way as well.

    feval : callable, list of callable functions or None, optional (default=None)
        Customized evaluation function.
        Each evaluation function should accept two parameters: preds, train_data,
        and return (eval_name, eval_result, is_higher_better) or list of such tuples.

            preds : list or numpy 1-D array
                The predicted values.
                If ``fobj`` is specified, predicted values are returned before any transformation,
                e.g. they are raw margin instead of probability of positive class for binary task in this case.
            train_data : Dataset
                The training dataset.
            eval_name : string
                The name of evaluation function (without whitespaces).
            eval_result : float
                The eval result.
            is_higher_better : bool
                Is eval result higher better, e.g. AUC is ``is_higher_better``.

        For multi-class task, the preds is group by class_id first, then group by row_id.
        If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].
        To ignore the default metric corresponding to the used objective,
        set the ``metric`` parameter to the string ``"None"`` in ``params``.
    init_model : string, Booster or None, optional (default=None)
        Filename of LightGBM model or Booster instance used for continue training.
    feature_name : list of strings or 'auto', optional (default="auto")
        Feature names.
        If 'auto' and data is pandas DataFrame, data columns names are used.
    categorical_feature : list of strings or int, or 'auto', optional (default="auto")
        Categorical features.
        If list of int, interpreted as indices.
        If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
        If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
        All values in categorical features should be less than int32 max value (2147483647).
        Large values could be memory consuming. Consider using consecutive integers starting from zero.
        All negative values in categorical features will be treated as missing values.
        The output cannot be monotonically constrained with respect to a categorical feature.
    early_stopping_rounds : int or None, optional (default=None)
        Activates early stopping. The model will train until the validation score stops improving.
        Validation score needs to improve at least every ``early_stopping_rounds`` round(s)
        to continue training.
        Requires at least one validation data and one metric.
        If there's more than one, will check all of them. But the training data is ignored anyway.
        To check only the first metric, set the ``first_metric_only`` parameter to ``True`` in ``params``.
        The index of iteration that has the best performance will be saved in the ``best_iteration`` field
        if early stopping logic is enabled by setting ``early_stopping_rounds``.
    evals_result: dict or None, optional (default=None)
        This dictionary used to store all evaluation results of all the items in ``valid_sets``.

        .. rubric:: Example

        With a ``valid_sets`` = [valid_set, train_set],
        ``valid_names`` = ['eval', 'train']
        and a ``params`` = {'metric': 'logloss'}
        returns {'train': {'logloss': ['0.48253', '0.35953', ...]},
        'eval': {'logloss': ['0.480385', '0.357756', ...]}}.

    verbose_eval : bool or int, optional (default=True)
        Requires at least one validation data.
        If True, the eval metric on the valid set is printed at each boosting stage.
        If int, the eval metric on the valid set is printed at every ``verbose_eval`` boosting stage.
        The last boosting stage or the boosting stage found by using ``early_stopping_rounds`` is also printed.

        .. rubric:: Example

        With ``verbose_eval`` = 4 and at least one item in ``valid_sets``,
        an evaluation metric is printed every 4 (instead of 1) boosting stages.

    learning_rates : list, callable or None, optional (default=None)
        List of learning rates for each boosting round
        or a customized function that calculates ``learning_rate``
        in terms of current number of round (e.g. yields learning rate decay).
    keep_training_booster : bool, optional (default=False)
        Whether the returned Booster will be used to keep training.
        If False, the returned value will be converted into _InnerPredictor before returning.
        This means you won't be able to use ``eval``, ``eval_train`` or ``eval_valid`` methods of the returned Booster.
        When your model is very large and cause the memory error,
        you can try to set this param to ``True`` to avoid the model conversion performed during the internal call of ``model_to_string``.
        You can still use _InnerPredictor as ``init_model`` for future continue training.
    callbacks : list of callables or None, optional (default=None)
        List of callback functions that are applied at each iteration.
        See Callbacks in Python API for more information.

    Returns
    -------
    booster : Booster
        The trained Booster model.
    """
    # create predictor first
    # Deep-copy so alias resolution below never mutates the caller's dict.
    params = copy.deepcopy(params)
    if fobj is not None:
        # A custom objective replaces any objective given via params/aliases.
        for obj_alias in _ConfigAliases.get("objective"):
            params.pop(obj_alias, None)
        params['objective'] = 'none'
    # Values passed through `params` aliases win over the keyword arguments.
    for alias in _ConfigAliases.get("num_iterations"):
        if alias in params:
            num_boost_round = params.pop(alias)
            _log_warning(f"Found `{alias}` in params. Will use it instead of argument")
    params["num_iterations"] = num_boost_round
    for alias in _ConfigAliases.get("early_stopping_round"):
        if alias in params:
            early_stopping_rounds = params.pop(alias)
            _log_warning(f"Found `{alias}` in params. Will use it instead of argument")
    params["early_stopping_round"] = early_stopping_rounds
    first_metric_only = params.get('first_metric_only', False)

    if num_boost_round <= 0:
        raise ValueError("num_boost_round should be greater than zero.")
    if isinstance(init_model, str):
        predictor = _InnerPredictor(model_file=init_model, pred_parameter=params)
    elif isinstance(init_model, Booster):
        predictor = init_model._to_predictor(dict(init_model.params, **params))
    else:
        predictor = None
    # Continued training resumes iteration numbering after the init model.
    init_iteration = predictor.num_total_iteration if predictor is not None else 0
    # check dataset
    if not isinstance(train_set, Dataset):
        raise TypeError("Training only accepts Dataset object")

    train_set._update_params(params) \
             ._set_predictor(predictor) \
             .set_feature_name(feature_name) \
             .set_categorical_feature(categorical_feature)

    is_valid_contain_train = False
    train_data_name = "training"
    reduced_valid_sets = []
    name_valid_sets = []
    if valid_sets is not None:
        if isinstance(valid_sets, Dataset):
            valid_sets = [valid_sets]
        if isinstance(valid_names, str):
            valid_names = [valid_names]
        for i, valid_data in enumerate(valid_sets):
            # reduce cost for prediction training data
            if valid_data is train_set:
                is_valid_contain_train = True
                if valid_names is not None:
                    train_data_name = valid_names[i]
                continue
            if not isinstance(valid_data, Dataset):
                raise TypeError("Training only accepts Dataset object")
            reduced_valid_sets.append(valid_data._update_params(params).set_reference(train_set))
            if valid_names is not None and len(valid_names) > i:
                name_valid_sets.append(valid_names[i])
            else:
                name_valid_sets.append(f'valid_{i}')
    # process callbacks
    if callbacks is None:
        callbacks = set()
    else:
        # User callbacks without an explicit `order` run last, in given order.
        for i, cb in enumerate(callbacks):
            cb.__dict__.setdefault('order', i - len(callbacks))
        callbacks = set(callbacks)

    # Most of legacy advanced options becomes callbacks
    if verbose_eval is True:
        callbacks.add(callback.print_evaluation())
    elif isinstance(verbose_eval, int):
        callbacks.add(callback.print_evaluation(verbose_eval))

    if early_stopping_rounds is not None and early_stopping_rounds > 0:
        callbacks.add(callback.early_stopping(early_stopping_rounds, first_metric_only, verbose=bool(verbose_eval)))

    if learning_rates is not None:
        callbacks.add(callback.reset_parameter(learning_rate=learning_rates))

    if evals_result is not None:
        callbacks.add(callback.record_evaluation(evals_result))

    # Split callbacks by phase, then order each phase by their `order` attribute.
    callbacks_before_iter = {cb for cb in callbacks if getattr(cb, 'before_iteration', False)}
    callbacks_after_iter = callbacks - callbacks_before_iter
    callbacks_before_iter = sorted(callbacks_before_iter, key=attrgetter('order'))
    callbacks_after_iter = sorted(callbacks_after_iter, key=attrgetter('order'))

    # construct booster
    try:
        booster = Booster(params=params, train_set=train_set)
        if is_valid_contain_train:
            booster.set_train_data_name(train_data_name)
        for valid_set, name_valid_set in zip(reduced_valid_sets, name_valid_sets):
            booster.add_valid(valid_set, name_valid_set)
    finally:
        # Always restore the Datasets' original params, even if construction fails.
        train_set._reverse_update_params()
        for valid_set in reduced_valid_sets:
            valid_set._reverse_update_params()
    booster.best_iteration = 0

    # start training
    for i in range(init_iteration, init_iteration + num_boost_round):
        for cb in callbacks_before_iter:
            cb(callback.CallbackEnv(model=booster,
                                    params=params,
                                    iteration=i,
                                    begin_iteration=init_iteration,
                                    end_iteration=init_iteration + num_boost_round,
                                    evaluation_result_list=None))

        booster.update(fobj=fobj)

        evaluation_result_list = []
        # check evaluation result.
        if valid_sets is not None:
            if is_valid_contain_train:
                evaluation_result_list.extend(booster.eval_train(feval))
            evaluation_result_list.extend(booster.eval_valid(feval))
        # After-iteration callbacks may raise EarlyStopException to stop training.
        try:
            for cb in callbacks_after_iter:
                cb(callback.CallbackEnv(model=booster,
                                        params=params,
                                        iteration=i,
                                        begin_iteration=init_iteration,
                                        end_iteration=init_iteration + num_boost_round,
                                        evaluation_result_list=evaluation_result_list))
        except callback.EarlyStopException as earlyStopException:
            # best_iteration is 1-based for users, hence the +1.
            booster.best_iteration = earlyStopException.best_iteration + 1
            evaluation_result_list = earlyStopException.best_score
            break
    # NOTE: num_boost_round > 0 is enforced above, so the loop body ran at least
    # once and evaluation_result_list is always bound here.
    booster.best_score = collections.defaultdict(collections.OrderedDict)
    for dataset_name, eval_name, score, _ in evaluation_result_list:
        booster.best_score[dataset_name][eval_name] = score
    if not keep_training_booster:
        # Round-trip through the model string to drop Dataset references.
        booster.model_from_string(booster.model_to_string(), False).free_dataset()
    return booster


282
class CVBooster:
    """CVBooster in LightGBM.

    Auxiliary data structure that holds the Boosters produced by the ``cv``
    function and redirects method calls to all of them at once.
    It exposes the same methods as the Booster class; each call is forwarded
    to every underlying Booster and the individual results are collected
    into a list.

    Attributes
    ----------
    boosters : list of Booster
        The list of underlying fitted models.
    best_iteration : int
        The best iteration of fitted model.
    """

    def __init__(self):
        """Initialize the CVBooster.

        Generally, no need to instantiate manually.
        """
        self.boosters = []
        self.best_iteration = -1

    def _append(self, booster):
        """Add a booster to CVBooster."""
        self.boosters.append(booster)

    def __getattr__(self, name):
        """Redirect methods call of CVBooster."""
        def handler_function(*args, **kwargs):
            """Call methods with each booster, and concatenate their results."""
            return [getattr(member, name)(*args, **kwargs) for member in self.boosters]
        return handler_function
wxchan's avatar
wxchan committed
318

319

320
321
def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratified=True,
                  shuffle=True, eval_train_metric=False):
    """Make a n-fold list of Booster from random indices.

    Parameters
    ----------
    full_data : Dataset
        The full dataset to be split into folds.
    folds : generator/iterator of (train_idx, test_idx) tuples, scikit-learn splitter or None
        Pre-defined folds; takes priority over ``nfold``/``stratified``/``shuffle``.
    nfold : int
        Number of folds when ``folds`` is None.
    params : dict
        Booster parameters (also inspected for a ranking objective).
    seed : int
        Seed for shuffling / stratified splitting.
    fpreproc : callable or None, optional (default=None)
        Preprocessing function applied to (train_set, valid_set, params) of each fold.
    stratified : bool, optional (default=True)
        Whether to use stratified splitting (non-ranking objectives only).
    shuffle : bool, optional (default=True)
        Whether to shuffle before splitting.
    eval_train_metric : bool, optional (default=False)
        Whether to also register the train set for evaluation on each fold booster.

    Returns
    -------
    CVBooster
        Holder with one Booster per fold.
    """
    full_data = full_data.construct()
    num_data = full_data.num_data()
    if folds is not None:
        if not hasattr(folds, '__iter__') and not hasattr(folds, 'split'):
            raise AttributeError("folds should be a generator or iterator of (train_idx, test_idx) tuples "
                                 "or scikit-learn splitter object with split method")
        if hasattr(folds, 'split'):
            # Splitter object: build a flat group vector so group-aware
            # splitters keep query groups intact for ranking data.
            group_info = full_data.get_group()
            if group_info is not None:
                group_info = np.array(group_info, dtype=np.int32, copy=False)
                flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
            else:
                # No query groups: every row gets group id 0.
                flatted_group = np.zeros(num_data, dtype=np.int32)
            folds = folds.split(X=np.zeros(num_data), y=full_data.get_label(), groups=flatted_group)
    else:
        # Ranking objectives must be split by query group, not by row.
        if any(params.get(obj_alias, "") in {"lambdarank", "rank_xendcg", "xendcg",
                                             "xe_ndcg", "xe_ndcg_mart", "xendcg_mart"}
               for obj_alias in _ConfigAliases.get("objective")):
            if not SKLEARN_INSTALLED:
                raise LightGBMError('scikit-learn is required for ranking cv')
            # ranking task, split according to groups
            group_info = np.array(full_data.get_group(), dtype=np.int32, copy=False)
            flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
            group_kfold = _LGBMGroupKFold(n_splits=nfold)
            folds = group_kfold.split(X=np.zeros(num_data), groups=flatted_group)
        elif stratified:
            if not SKLEARN_INSTALLED:
                raise LightGBMError('scikit-learn is required for stratified cv')
            skf = _LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed)
            folds = skf.split(X=np.zeros(num_data), y=full_data.get_label())
        else:
            # Plain k-fold on row indices, optionally shuffled.
            if shuffle:
                randidx = np.random.RandomState(seed).permutation(num_data)
            else:
                randidx = np.arange(num_data)
            kstep = int(num_data / nfold)
            test_id = [randidx[i: i + kstep] for i in range(0, num_data, kstep)]
            # Each fold trains on the concatenation of all other folds' indices.
            train_id = [np.concatenate([test_id[i] for i in range(nfold) if k != i]) for k in range(nfold)]
            folds = zip(train_id, test_id)

    ret = CVBooster()
    for train_idx, test_idx in folds:
        # Subsets are taken on sorted indices; Dataset.subset expects ordered rows.
        train_set = full_data.subset(sorted(train_idx))
        valid_set = full_data.subset(sorted(test_idx))
        # run preprocessing on the data set if needed
        if fpreproc is not None:
            train_set, valid_set, tparam = fpreproc(train_set, valid_set, params.copy())
        else:
            tparam = params
        cvbooster = Booster(tparam, train_set)
        if eval_train_metric:
            cvbooster.add_valid(train_set, 'train')
        cvbooster.add_valid(valid_set, 'valid')
        ret._append(cvbooster)
    return ret

wxchan's avatar
wxchan committed
379

380
def _agg_cv_result(raw_results, eval_train_metric=False):
381
    """Aggregate cross-validation results."""
382
    cvmap = collections.OrderedDict()
wxchan's avatar
wxchan committed
383
384
385
    metric_type = {}
    for one_result in raw_results:
        for one_line in one_result:
386
            if eval_train_metric:
387
                key = f"{one_line[0]} {one_line[1]}"
388
389
390
            else:
                key = one_line[1]
            metric_type[key] = one_line[3]
391
            cvmap.setdefault(key, [])
392
            cvmap[key].append(one_line[2])
wxchan's avatar
wxchan committed
393
    return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()]
wxchan's avatar
wxchan committed
394

wxchan's avatar
wxchan committed
395

396
def cv(params, train_set, num_boost_round=100,
397
       folds=None, nfold=5, stratified=True, shuffle=True,
wxchan's avatar
wxchan committed
398
       metrics=None, fobj=None, feval=None, init_model=None,
399
       feature_name='auto', categorical_feature='auto',
Guolin Ke's avatar
Guolin Ke committed
400
401
       early_stopping_rounds=None, fpreproc=None,
       verbose_eval=None, show_stdv=True, seed=0,
402
403
       callbacks=None, eval_train_metric=False,
       return_cvbooster=False):
Andrew Ziem's avatar
Andrew Ziem committed
404
    """Perform the cross-validation with given parameters.
wxchan's avatar
wxchan committed
405
406
407
408

    Parameters
    ----------
    params : dict
409
        Parameters for Booster.
Guolin Ke's avatar
Guolin Ke committed
410
    train_set : Dataset
411
        Data to be trained on.
412
    num_boost_round : int, optional (default=100)
wxchan's avatar
wxchan committed
413
        Number of boosting iterations.
414
    folds : generator or iterator of (train_idx, test_idx) tuples, scikit-learn splitter object or None, optional (default=None)
415
        If generator or iterator, it should yield the train and test indices for each fold.
416
        If object, it should be one of the scikit-learn splitter classes
417
        (https://scikit-learn.org/stable/modules/classes.html#splitter-classes)
418
        and have ``split`` method.
419
        This argument has highest priority over other data split arguments.
420
    nfold : int, optional (default=5)
wxchan's avatar
wxchan committed
421
        Number of folds in CV.
422
423
    stratified : bool, optional (default=True)
        Whether to perform stratified sampling.
424
    shuffle : bool, optional (default=True)
425
426
427
428
429
        Whether to shuffle before splitting data.
    metrics : string, list of strings or None, optional (default=None)
        Evaluation metrics to be monitored while CV.
        If not None, the metric in ``params`` will be overridden.
    fobj : callable or None, optional (default=None)
430
431
432
433
434
435
        Customized objective function.
        Should accept two parameters: preds, train_data,
        and return (grad, hess).

            preds : list or numpy 1-D array
                The predicted values.
436
437
                Predicted values are returned before any transformation,
                e.g. they are raw margin instead of probability of positive class for binary task.
438
439
440
            train_data : Dataset
                The training dataset.
            grad : list or numpy 1-D array
441
442
                The value of the first order derivative (gradient) of the loss
                with respect to the elements of preds for each sample point.
443
            hess : list or numpy 1-D array
444
445
                The value of the second order derivative (Hessian) of the loss
                with respect to the elements of preds for each sample point.
446
447
448
449
450

        For multi-class task, the preds is group by class_id first, then group by row_id.
        If you want to get i-th row preds in j-th class, the access way is score[j * num_data + i]
        and you should group grad and hess in this way as well.

451
    feval : callable, list of callable functions or None, optional (default=None)
452
        Customized evaluation function.
453
        Each evaluation function should accept two parameters: preds, train_data,
454
        and return (eval_name, eval_result, is_higher_better) or list of such tuples.
455
456
457

            preds : list or numpy 1-D array
                The predicted values.
458
459
                If ``fobj`` is specified, predicted values are returned before any transformation,
                e.g. they are raw margin instead of probability of positive class for binary task in this case.
460
461
462
            train_data : Dataset
                The training dataset.
            eval_name : string
Andrew Ziem's avatar
Andrew Ziem committed
463
                The name of evaluation function (without whitespace).
464
465
466
467
468
            eval_result : float
                The eval result.
            is_higher_better : bool
                Is eval result higher better, e.g. AUC is ``is_higher_better``.

469
470
        For multi-class task, the preds is group by class_id first, then group by row_id.
        If you want to get i-th row preds in j-th class, the access way is preds[j * num_data + i].
471
472
        To ignore the default metric corresponding to the used objective,
        set ``metrics`` to the string ``"None"``.
473
    init_model : string, Booster or None, optional (default=None)
474
475
476
477
478
479
480
481
        Filename of LightGBM model or Booster instance used for continue training.
    feature_name : list of strings or 'auto', optional (default="auto")
        Feature names.
        If 'auto' and data is pandas DataFrame, data columns names are used.
    categorical_feature : list of strings or int, or 'auto', optional (default="auto")
        Categorical features.
        If list of int, interpreted as indices.
        If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
482
        If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
483
        All values in categorical features should be less than int32 max value (2147483647).
484
        Large values could be memory consuming. Consider using consecutive integers starting from zero.
485
        All negative values in categorical features will be treated as missing values.
486
        The output cannot be monotonically constrained with respect to a categorical feature.
487
    early_stopping_rounds : int or None, optional (default=None)
488
489
490
491
        Activates early stopping.
        CV score needs to improve at least every ``early_stopping_rounds`` round(s)
        to continue.
        Requires at least one metric. If there's more than one, will check all of them.
492
        To check only the first metric, set the ``first_metric_only`` parameter to ``True`` in ``params``.
493
        Last entry in evaluation history is the one from the best iteration.
494
495
    fpreproc : callable or None, optional (default=None)
        Preprocessing function that takes (dtrain, dtest, params)
wxchan's avatar
wxchan committed
496
        and returns transformed versions of those.
497
    verbose_eval : bool, int, or None, optional (default=None)
wxchan's avatar
wxchan committed
498
499
        Whether to display the progress.
        If None, progress will be displayed when np.ndarray is returned.
500
501
502
        If True, progress will be displayed at every boosting stage.
        If int, progress will be displayed at every given ``verbose_eval`` boosting stage.
    show_stdv : bool, optional (default=True)
wxchan's avatar
wxchan committed
503
        Whether to display the standard deviation in progress.
504
        Results are not affected by this parameter, and always contain std.
505
    seed : int, optional (default=0)
wxchan's avatar
wxchan committed
506
        Seed used to generate the folds (passed to numpy.random.seed).
507
    callbacks : list of callables or None, optional (default=None)
508
        List of callback functions that are applied at each iteration.
509
        See Callbacks in Python API for more information.
510
511
512
    eval_train_metric : bool, optional (default=False)
        Whether to display the train metric in progress.
        The score of the metric is calculated again after each training step, so there is some impact on performance.
513
514
    return_cvbooster : bool, optional (default=False)
        Whether to return Booster models trained on each fold through ``CVBooster``.
wxchan's avatar
wxchan committed
515
516
517

    Returns
    -------
518
519
520
521
    eval_hist : dict
        Evaluation history.
        The dictionary has the following format:
        {'metric1-mean': [values], 'metric1-stdv': [values],
Qiwei Ye's avatar
Qiwei Ye committed
522
        'metric2-mean': [values], 'metric2-stdv': [values],
523
        ...}.
524
        If ``return_cvbooster=True``, also returns trained boosters via ``cvbooster`` key.
wxchan's avatar
wxchan committed
525
    """
Guolin Ke's avatar
Guolin Ke committed
526
    if not isinstance(train_set, Dataset):
527
        raise TypeError("Training only accepts Dataset object")
Guolin Ke's avatar
Guolin Ke committed
528

529
    params = copy.deepcopy(params)
530
    if fobj is not None:
531
532
        for obj_alias in _ConfigAliases.get("objective"):
            params.pop(obj_alias, None)
533
        params['objective'] = 'none'
534
    for alias in _ConfigAliases.get("num_iterations"):
535
        if alias in params:
536
            _log_warning(f"Found `{alias}` in params. Will use it instead of argument")
537
            num_boost_round = params.pop(alias)
538
    params["num_iterations"] = num_boost_round
539
    for alias in _ConfigAliases.get("early_stopping_round"):
540
        if alias in params:
541
            _log_warning(f"Found `{alias}` in params. Will use it instead of argument")
542
            early_stopping_rounds = params.pop(alias)
543
544
    params["early_stopping_round"] = early_stopping_rounds
    first_metric_only = params.get('first_metric_only', False)
545

546
547
    if num_boost_round <= 0:
        raise ValueError("num_boost_round should be greater than zero.")
548
    if isinstance(init_model, str):
549
        predictor = _InnerPredictor(model_file=init_model, pred_parameter=params)
Guolin Ke's avatar
Guolin Ke committed
550
    elif isinstance(init_model, Booster):
551
        predictor = init_model._to_predictor(dict(init_model.params, **params))
Guolin Ke's avatar
Guolin Ke committed
552
553
554
    else:
        predictor = None

Peter's avatar
Peter committed
555
    if metrics is not None:
556
557
        for metric_alias in _ConfigAliases.get("metric"):
            params.pop(metric_alias, None)
Peter's avatar
Peter committed
558
        params['metric'] = metrics
wxchan's avatar
wxchan committed
559

560
561
562
563
564
    train_set._update_params(params) \
             ._set_predictor(predictor) \
             .set_feature_name(feature_name) \
             .set_categorical_feature(categorical_feature)

wxchan's avatar
wxchan committed
565
    results = collections.defaultdict(list)
566
567
    cvfolds = _make_n_folds(train_set, folds=folds, nfold=nfold,
                            params=params, seed=seed, fpreproc=fpreproc,
568
569
                            stratified=stratified, shuffle=shuffle,
                            eval_train_metric=eval_train_metric)
wxchan's avatar
wxchan committed
570
571

    # setup callbacks
572
    if callbacks is None:
wxchan's avatar
wxchan committed
573
574
575
576
577
        callbacks = set()
    else:
        for i, cb in enumerate(callbacks):
            cb.__dict__.setdefault('order', i - len(callbacks))
        callbacks = set(callbacks)
578
    if early_stopping_rounds is not None and early_stopping_rounds > 0:
579
        callbacks.add(callback.early_stopping(early_stopping_rounds, first_metric_only, verbose=False))
wxchan's avatar
wxchan committed
580
581
    if verbose_eval is True:
        callbacks.add(callback.print_evaluation(show_stdv=show_stdv))
582
    elif isinstance(verbose_eval, int):
wxchan's avatar
wxchan committed
583
        callbacks.add(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))
wxchan's avatar
wxchan committed
584

wxchan's avatar
wxchan committed
585
586
587
588
    callbacks_before_iter = {cb for cb in callbacks if getattr(cb, 'before_iteration', False)}
    callbacks_after_iter = callbacks - callbacks_before_iter
    callbacks_before_iter = sorted(callbacks_before_iter, key=attrgetter('order'))
    callbacks_after_iter = sorted(callbacks_after_iter, key=attrgetter('order'))
wxchan's avatar
wxchan committed
589

590
    for i in range(num_boost_round):
wxchan's avatar
wxchan committed
591
        for cb in callbacks_before_iter:
592
593
            cb(callback.CallbackEnv(model=cvfolds,
                                    params=params,
wxchan's avatar
wxchan committed
594
595
596
597
                                    iteration=i,
                                    begin_iteration=0,
                                    end_iteration=num_boost_round,
                                    evaluation_result_list=None))
wxchan's avatar
wxchan committed
598
        cvfolds.update(fobj=fobj)
599
        res = _agg_cv_result(cvfolds.eval_valid(feval), eval_train_metric)
wxchan's avatar
wxchan committed
600
        for _, key, mean, _, std in res:
601
602
            results[f'{key}-mean'].append(mean)
            results[f'{key}-stdv'].append(std)
wxchan's avatar
wxchan committed
603
604
        try:
            for cb in callbacks_after_iter:
605
606
                cb(callback.CallbackEnv(model=cvfolds,
                                        params=params,
wxchan's avatar
wxchan committed
607
608
609
610
                                        iteration=i,
                                        begin_iteration=0,
                                        end_iteration=num_boost_round,
                                        evaluation_result_list=res))
611
612
        except callback.EarlyStopException as earlyStopException:
            cvfolds.best_iteration = earlyStopException.best_iteration + 1
wxchan's avatar
wxchan committed
613
            for k in results:
614
                results[k] = results[k][:cvfolds.best_iteration]
wxchan's avatar
wxchan committed
615
            break
616
617
618
619

    if return_cvbooster:
        results['cvbooster'] = cvfolds

wxchan's avatar
wxchan committed
620
    return dict(results)