engine.py 15.9 KB
Newer Older
wxchan's avatar
wxchan committed
1
2
3
4
5
# coding: utf-8
# pylint: disable = invalid-name, W0105
"""Training Library containing training routines of LightGBM."""
from __future__ import absolute_import

wxchan's avatar
wxchan committed
6
7
import collections
from operator import attrgetter
wxchan's avatar
wxchan committed
8
import numpy as np
Guolin Ke's avatar
Guolin Ke committed
9
from .basic import LightGBMError, _InnerPredictor, Dataset, Booster, is_str
wxchan's avatar
wxchan committed
10
11
from . import callback

Guolin Ke's avatar
Guolin Ke committed
12
13
def train(params, train_set, num_boost_round=100,
          valid_sets=None, valid_names=None,
          fobj=None, feval=None, init_model=None,
          feature_name=None, categorical_feature=None,
          early_stopping_rounds=None, evals_result=None,
          verbose_eval=True, learning_rates=None, callbacks=None):
    """Train with given parameters.

    Parameters
    ----------
    params : dict
        Parameters for training.
    train_set : Dataset
        Data to be trained.
    num_boost_round : int
        Number of boosting iterations.
    valid_sets : Dataset or list of Datasets
        Data to be evaluated during training. If ``train_set`` itself
        appears in the list it is evaluated as training data instead of
        being added as a separate validation set.
    valid_names : string or list of string
        Names of valid_sets. Entries without a matching name are called
        'valid_<index>' (or 'training' for the train set).
    fobj : function
        Customized objective function.
    feval : function
        Customized evaluation function.
        Note: should return (eval_name, eval_result, is_higher_better)
        or a list of such tuples.
    init_model : file name of lightgbm model or 'Booster' instance
        Model used for continued training.
    feature_name : list of str
        Feature names.
    categorical_feature : list of str or int
        Categorical features, type int represents index,
        type str represents feature names (need to specify feature_name as well).
    early_stopping_rounds : int
        Activates early stopping.
        Requires at least one validation data and one metric.
        If there's more than one, will check all of them.
        Returns the model with (best_iter + early_stopping_rounds).
        If early stopping occurs, the model will add 'best_iteration' field.
    evals_result : dict or None
        This dictionary is used to store all evaluation results of all the
        items in valid_sets.
        Example: with a valid_sets containing [valid_set, train_set]
        and valid_names containing ['eval', 'train'] and a parameter
        containing ('metric':'logloss')
        Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
                  'eval': {'logloss': ['0.480385', '0.357756', ...]}}
        Passing None disables this feature.
    verbose_eval : bool or int
        Requires at least one item in valid_sets.
        If `verbose_eval` is True then the evaluation metric on the
        validation set is printed at each boosting stage.
        If `verbose_eval` is an integer then the evaluation metric on the
        validation set is printed at every given `verbose_eval` boosting
        stage. The last boosting stage / the boosting stage found by using
        `early_stopping_rounds` is also printed.
        Example: with verbose_eval=4 and at least one item in valid_sets,
        an evaluation metric is printed every 4 boosting stages, instead of
        every boosting stage.
    learning_rates : list or function
        List of learning rate for each boosting round
        or a customized function that calculates learning_rate in terms of
        current number of round and the total number of boosting round
        (e.g. yields learning rate decay)
        - list l: learning_rate = l[current_round]
        - function f: learning_rate = f(current_round, total_boost_round)
          or learning_rate = f(current_round)
    callbacks : list of callback functions
        List of callback functions that are applied at end of each iteration.
        Callbacks exposing a truthy ``before_iteration`` attribute are run
        before the boosting update instead.

    Returns
    -------
    booster : a trained booster model

    Raises
    ------
    TypeError
        If ``train_set`` or any entry of ``valid_sets`` is not a Dataset.
    """
    # Create the predictor first so continued training can start from init_model.
    if is_str(init_model):
        predictor = _InnerPredictor(model_file=init_model)
    elif isinstance(init_model, Booster):
        predictor = init_model._to_predictor()
    else:
        predictor = None
    init_iteration = predictor.num_total_iteration if predictor is not None else 0

    # Check the training dataset.
    if not isinstance(train_set, Dataset):
        raise TypeError("Training only accepts Dataset object")

    train_set._set_predictor(predictor)
    train_set.set_feature_name(feature_name)
    train_set.set_categorical_feature(categorical_feature)

    is_valid_contain_train = False
    train_data_name = "training"
    reduced_valid_sets = []
    name_valid_sets = []
    if valid_sets is not None:
        if isinstance(valid_sets, Dataset):
            valid_sets = [valid_sets]
        if is_str(valid_names):
            valid_names = [valid_names]
        for i, valid_data in enumerate(valid_sets):
            # valid_names may be shorter than valid_sets; never index past its end.
            has_name = valid_names is not None and len(valid_names) > i
            # Reduce cost for prediction on training data: the train set is
            # evaluated through eval_train instead of being added again.
            if valid_data is train_set:
                is_valid_contain_train = True
                if has_name:
                    train_data_name = valid_names[i]
                continue
            if not isinstance(valid_data, Dataset):
                raise TypeError("Training only accepts Dataset object")
            valid_data.set_reference(train_set)
            reduced_valid_sets.append(valid_data)
            if has_name:
                name_valid_sets.append(valid_names[i])
            else:
                name_valid_sets.append('valid_' + str(i))

    # Process callbacks: user callbacks get a negative default 'order' so they
    # sort by their position in the list they were passed in.
    if callbacks is None:
        callbacks = set()
    else:
        for i, cb in enumerate(callbacks):
            cb.__dict__.setdefault('order', i - len(callbacks))
        callbacks = set(callbacks)

    # Most of legacy advanced options becomes callbacks
    if verbose_eval is True:
        callbacks.add(callback.print_evaluation())
    elif isinstance(verbose_eval, int):
        callbacks.add(callback.print_evaluation(verbose_eval))

    if early_stopping_rounds is not None:
        callbacks.add(callback.early_stop(early_stopping_rounds,
                                          verbose=bool(verbose_eval)))

    if learning_rates is not None:
        callbacks.add(callback.reset_learning_rate(learning_rates))

    if evals_result is not None:
        callbacks.add(callback.record_evaluation(evals_result))

    callbacks_before_iter = {cb for cb in callbacks if getattr(cb, 'before_iteration', False)}
    callbacks_after_iter = callbacks - callbacks_before_iter
    callbacks_before_iter = sorted(callbacks_before_iter, key=attrgetter('order'))
    callbacks_after_iter = sorted(callbacks_after_iter, key=attrgetter('order'))

    # Construct the booster and register the validation sets.
    booster = Booster(params=params, train_set=train_set)
    if is_valid_contain_train:
        booster.set_train_data_name(train_data_name)
    for valid_set, name_valid_set in zip(reduced_valid_sets, name_valid_sets):
        booster.add_valid(valid_set, name_valid_set)

    # Start training; iteration numbers continue after those of init_model.
    end_iteration = init_iteration + num_boost_round
    for i in range(init_iteration, end_iteration):
        for cb in callbacks_before_iter:
            cb(callback.CallbackEnv(model=booster,
                                    cvfolds=None,
                                    iteration=i,
                                    begin_iteration=init_iteration,
                                    end_iteration=end_iteration,
                                    evaluation_result_list=None))

        booster.update(fobj=fobj)

        evaluation_result_list = []
        # check evaluation result.
        if valid_sets is not None:
            if is_valid_contain_train:
                evaluation_result_list.extend(booster.eval_train(feval))
            evaluation_result_list.extend(booster.eval_valid(feval))
        try:
            for cb in callbacks_after_iter:
                cb(callback.CallbackEnv(model=booster,
                                        cvfolds=None,
                                        iteration=i,
                                        begin_iteration=init_iteration,
                                        end_iteration=end_iteration,
                                        evaluation_result_list=evaluation_result_list))
        except callback.EarlyStopException:
            # An after-iteration callback requested that training stop here.
            break
    # The 'best_iteration' attribute is stored 0-based; expose it 1-based.
    if booster.attr('best_iteration') is not None:
        booster.best_iteration = int(booster.attr('best_iteration')) + 1
    else:
        booster.best_iteration = num_boost_round
    return booster


class CVBooster(object):
    """Auxiliary data structure to hold one fold of CV.

    Attributes
    ----------
    train_set : Dataset
        Training part of the fold.
    valid_test : Dataset
        Held-out part of the fold, registered on the booster as 'valid'.
    booster : Booster
        Booster trained on ``train_set``.
    """

    def __init__(self, train_set, valid_test, params):
        """Initialize the CVBooster.

        Parameters
        ----------
        train_set : Dataset
            Data the fold's booster is trained on.
        valid_test : Dataset
            Data the fold's booster is evaluated on.
        params : dict
            Parameters passed to the Booster constructor.
        """
        self.train_set = train_set
        self.valid_test = valid_test
        self.booster = Booster(params=params, train_set=train_set)
        self.booster.add_valid(valid_test, 'valid')

    def update(self, fobj=None):
        """Update the booster for one iteration.

        Parameters
        ----------
        fobj : function or None
            Customized objective function forwarded to ``Booster.update``.
        """
        self.booster.update(fobj=fobj)

    def eval(self, feval=None):
        """Evaluate the CVBooster for one iteration.

        Parameters
        ----------
        feval : function or None
            Customized evaluation function forwarded to ``Booster.eval_valid``.

        Returns
        -------
        result : whatever ``Booster.eval_valid`` returns for the 'valid' set
        """
        return self.booster.eval_valid(feval)

# Optional scikit-learn dependency, only needed for stratified cross-validation.
# SKLEARN_StratifiedKFold tells callers whether `StratifiedKFold` is usable.
try:
    from sklearn.model_selection import StratifiedKFold
    SKLEARN_StratifiedKFold = True
except ImportError:
    try:
        from sklearn.cross_validation import StratifiedKFold as _LegacyStratifiedKFold
    except ImportError:
        SKLEARN_StratifiedKFold = False
    else:
        class StratifiedKFold(object):
            """Adapter giving the legacy splitter the ``model_selection`` API.

            The legacy ``sklearn.cross_validation.StratifiedKFold`` takes the
            labels in its constructor, names the fold count ``n_folds`` and is
            iterated directly, whereas this module constructs the splitter with
            ``n_splits=...`` and then calls ``split(X, y)``.
            """

            def __init__(self, n_splits=3, shuffle=False, random_state=None):
                self.n_splits = n_splits
                self.shuffle = shuffle
                self.random_state = random_state

            def split(self, X, y, groups=None):
                """Yield (train_indices, test_indices) pairs stratified on y."""
                return iter(_LegacyStratifiedKFold(y, n_folds=self.n_splits,
                                                   shuffle=self.shuffle,
                                                   random_state=self.random_state))

        SKLEARN_StratifiedKFold = True
wxchan's avatar
wxchan committed
219

wxchan's avatar
wxchan committed
220
def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=False):
    """
    Make an n-fold list of CVBooster from random indices.

    Parameters
    ----------
    full_data : Dataset
        Data to split; each fold is built with ``full_data.subset``.
    nfold : int
        Number of folds.
    params : dict
        Booster parameters, given to every fold (a copy is handed to
        ``fpreproc`` when one is supplied).
    seed : int
        Seed passed to ``numpy.random.seed`` and to the stratified splitter.
    fpreproc : function or None
        Called as ``fpreproc(train_set, valid_set, params_copy)``; must return
        the (train_set, valid_set, params) triple to use for the fold.
    stratified : bool
        Split with StratifiedKFold on the labels instead of a plain permutation.

    Returns
    -------
    folds : list of CVBooster

    Raises
    ------
    LightGBMError
        If ``stratified`` is requested but scikit-learn is not available.
    """
    np.random.seed(seed)
    if stratified:
        if not SKLEARN_StratifiedKFold:
            raise LightGBMError('Scikit-learn is required for stratified cv')
        labels = full_data.get_label()
        sfk = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
        # Only the held-out indices are kept; train indices are rebuilt below.
        idset = [test_idx for _, test_idx in sfk.split(X=labels, y=labels)]
    else:
        full_data.construct()
        randidx = np.random.permutation(full_data.num_data())
        # array_split spreads the remainder over the first folds, so every
        # sample lands in exactly one fold even when the data size is not a
        # multiple of nfold (fixed-width slicing dropped the remainder).
        idset = np.array_split(randidx, nfold)

    ret = []
    for k in range(nfold):
        train_idx = np.concatenate([idset[i] for i in range(nfold) if i != k])
        train_set = full_data.subset(train_idx)
        valid_set = full_data.subset(idset[k])
        # run preprocessing on the data set if needed
        if fpreproc is not None:
            train_set, valid_set, tparam = fpreproc(train_set, valid_set, params.copy())
        else:
            tparam = params
        ret.append(CVBooster(train_set, valid_set, tparam))
    return ret

def _agg_cv_result(raw_results):
    """
    Aggregate cross-validation results.
    """
wxchan's avatar
wxchan committed
253
    cvmap = collections.defaultdict(list)
wxchan's avatar
wxchan committed
254
255
256
    metric_type = {}
    for one_result in raw_results:
        for one_line in one_result:
wxchan's avatar
wxchan committed
257
258
259
            metric_type[one_line[1]] = one_line[3]
            cvmap[one_line[1]].append(one_line[2])
    return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()]
wxchan's avatar
wxchan committed
260

Guolin Ke's avatar
Guolin Ke committed
261
262
def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
       metrics=(), fobj=None, feval=None, init_model=None,
       feature_name=None, categorical_feature=None,
       early_stopping_rounds=None, fpreproc=None,
       verbose_eval=None, show_stdv=True, seed=0,
       callbacks=None):
    """Cross-validation with given parameters.

    Parameters
    ----------
    params : dict
        Booster params. The dict passed in is not modified.
    train_set : Dataset
        Data to be trained.
    num_boost_round : int
        Number of boosting iterations.
    nfold : int
        Number of folds in CV.
    stratified : bool
        Perform stratified sampling.
    metrics : string or list of strings
        Evaluation metrics to be watched in CV, added to any 'metric'
        already present in params.
    fobj : function
        Custom objective function.
    feval : function
        Custom evaluation function.
    init_model : file name of lightgbm model or 'Booster' instance
        Model used for continued training.
    feature_name : list of str
        Feature names.
    categorical_feature : list of str or int
        Categorical features, type int represents index,
        type str represents feature names (need to specify feature_name as well).
    early_stopping_rounds : int
        Activates early stopping. CV error needs to decrease at least
        every <early_stopping_rounds> round(s) to continue.
        Last entry in evaluation history is the one from best iteration.
    fpreproc : function
        Preprocessing function that takes (dtrain, dtest, param) and returns
        transformed versions of those.
    verbose_eval : bool, int, or None, default None
        Whether to display the progress. If None, no printing callback is
        installed. If True, progress will be displayed at every boosting
        stage. If an integer is given, progress will be displayed at every
        given `verbose_eval` boosting stage.
    show_stdv : bool, default True
        Whether to display the standard deviation in progress.
        Results are not affected, and always contain std.
    seed : int
        Seed used to generate the folds (passed to numpy.random.seed).
    callbacks : list of callback functions
        List of callback functions that are applied at end of each iteration.
        Callbacks exposing a truthy ``before_iteration`` attribute are run
        before the boosting update instead.

    Returns
    -------
    evaluation history : dict
        Maps '<metric>-mean' and '<metric>-stdv' to lists holding one value
        per boosting round.

    Raises
    ------
    TypeError
        If ``train_set`` is not a Dataset.
    """
    if not isinstance(train_set, Dataset):
        raise TypeError("Training only accepts Dataset object")

    if is_str(init_model):
        predictor = _InnerPredictor(model_file=init_model)
    elif isinstance(init_model, Booster):
        predictor = init_model._to_predictor()
    else:
        predictor = None

    train_set._set_predictor(predictor)
    train_set.set_feature_name(feature_name)
    train_set.set_categorical_feature(categorical_feature)

    if metrics:
        # Work on a shallow copy with a fresh metric list so that neither the
        # caller's dict nor its existing metric list is mutated.
        params = dict(params)
        existing = params.get('metric')
        if existing is None:
            merged = []
        elif is_str(existing):
            # A single metric given as a plain string has no append/extend.
            merged = [existing]
        else:
            merged = list(existing)
        if is_str(metrics):
            merged.append(metrics)
        else:
            merged.extend(metrics)
        params['metric'] = merged

    results = collections.defaultdict(list)
    cvfolds = _make_n_folds(train_set, nfold, params, seed, fpreproc, stratified)

    # setup callbacks: user callbacks get a negative default 'order' so they
    # sort by their position in the list they were passed in.
    if callbacks is None:
        callbacks = set()
    else:
        for i, cb in enumerate(callbacks):
            cb.__dict__.setdefault('order', i - len(callbacks))
        callbacks = set(callbacks)
    if early_stopping_rounds is not None:
        callbacks.add(callback.early_stop(early_stopping_rounds, verbose=False))
    if verbose_eval is True:
        callbacks.add(callback.print_evaluation(show_stdv=show_stdv))
    elif isinstance(verbose_eval, int):
        callbacks.add(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))

    callbacks_before_iter = {cb for cb in callbacks if getattr(cb, 'before_iteration', False)}
    callbacks_after_iter = callbacks - callbacks_before_iter
    callbacks_before_iter = sorted(callbacks_before_iter, key=attrgetter('order'))
    callbacks_after_iter = sorted(callbacks_after_iter, key=attrgetter('order'))

    for i in range(num_boost_round):
        for cb in callbacks_before_iter:
            cb(callback.CallbackEnv(model=None,
                                    cvfolds=cvfolds,
                                    iteration=i,
                                    begin_iteration=0,
                                    end_iteration=num_boost_round,
                                    evaluation_result_list=None))
        for fold in cvfolds:
            fold.update(fobj)
        res = _agg_cv_result([f.eval(feval) for f in cvfolds])
        for _, key, mean, _, std in res:
            results[key + '-mean'].append(mean)
            results[key + '-stdv'].append(std)
        try:
            for cb in callbacks_after_iter:
                cb(callback.CallbackEnv(model=None,
                                        cvfolds=cvfolds,
                                        iteration=i,
                                        begin_iteration=0,
                                        end_iteration=num_boost_round,
                                        evaluation_result_list=res))
        except callback.EarlyStopException as e:
            # Drop the rounds recorded after the best iteration.
            for k in results:
                results[k] = results[k][:e.best_iteration + 1]
            break
    return dict(results)