# engine.py

"""Training Library containing training routines of LightGBM."""
from __future__ import absolute_import

import collections
import numpy as np
from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from . import callback


def _construct_dataset(x, y, reference=None,
    params=None, other_fields=None, predictor=None):
    if 'max_bin' in params:
        max_bin = int(params['max_bin'])
    else:
        max_bin = 255
    weight = None
    group = None
    init_score = None
    if other_fields is not None:
        if not is isinstance(other_fields, dict):
            raise TypeError("other filed data should be dict type")
        weight = None if 'weight' not in other_fields else other_fields['weight']
        group = None if 'group' not in other_fields else other_fields['group']
        init_score = None if 'init_score' not in other_fields else other_fields['init_score']
    if reference is None:
        ret = Dataset(x, y, max_bin=max_bin, 
            weight=weight, group=group, predictor=predictor, params=params)
    else:
        ret = reference.create_valid(x, y, weight, group, params=params)
    if init_score is not None:
        ret.set_init_score(init_score)
    return ret

def train(params, train_data, num_boost_round=100,
          valid_datas=None, valid_names=None,
          fobj=None, feval=None, init_model=None,
          train_fields=None, valid_fields=None,
          early_stopping_rounds=None, out_eval_result=None,
          verbose_eval=True, learning_rates=None, callbacks=None):
    """Train with given parameters.

    Parameters
    ----------
    params : dict
        Parameters for training.
    train_data : pair, (X, y)
        Data to be trained.
    num_boost_round : int
        Number of boosting iterations.
    valid_datas : list of pairs (valid_X, valid_y)
        List of data to be evaluated during training. An entry that is the
        very same object as `train_data` is evaluated as the training set
        instead of being constructed a second time.
    valid_names : list of string
        Names of valid_datas. If None, validation sets are named
        'valid_<index>' and the training set is named 'training'.
    fobj : function
        Customized objective function.
    feval : function
        Customized evaluation function.
        Note: should return (eval_name, eval_result, is_higher_better)
        or a list of such tuples.
    init_model : file name of lightgbm model, 'Booster' or 'Predictor' instance
        Model used for continued training.
    train_fields : dict
        Other data fields of the training data,
        e.g. train_fields['weight'] is the weight data.
        Supported fields: weight, group, init_score
    valid_fields : list of dict
        Other data fields of the validation data,
        e.g. valid_fields[0]['weight'] is the weight data of the first
        validation data.
        Supported fields: weight, group, init_score
    early_stopping_rounds : int
        Activates early stopping.
        Requires at least one validation data and one metric.
        If there's more than one, will check all of them.
        Returns the model with (best_iter + early_stopping_rounds).
        If early stopping occurs, the model will add 'best_iteration' field.
    out_eval_result : dict or None
        This dictionary is used to store all evaluation results of all the
        items in valid_datas.
        Example: with a valid_datas containing [dtest, dtrain] and
        valid_names containing ['eval', 'train'] and
        a parameter containing ('metric':'logloss')
        Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
                  'eval': {'logloss': ['0.480385', '0.357756', ...]}}
        Passing None disables this feature.
    verbose_eval : bool or int
        Requires at least one item in valid_datas.
        If `verbose_eval` is True then the evaluation metric on the
        validation set is printed at each boosting stage.
        If `verbose_eval` is an integer then the evaluation metric on the
        validation set is printed at every given `verbose_eval` boosting
        stage. The last boosting stage / the boosting stage found by using
        `early_stopping_rounds` is also printed.
        Example: with verbose_eval=4 and at least one item in valid_datas,
        an evaluation metric is printed every 4 boosting stages, instead of
        every boosting stage.
    learning_rates : list or function
        List of learning rate for each boosting round
        or a customized function that calculates learning_rate in terms of
        current number of round and the total number of boosting round
        (e.g. yields learning rate decay)
        - list l: learning_rate = l[current_round]
        - function f: learning_rate = f(current_round, total_boost_round)
    callbacks : list of callback functions
        List of callback functions that are applied at end of each
        iteration. The list passed in is not modified.

    Returns
    -------
    booster : a trained booster model
    """
    # Create the predictor first: it is handed to the training dataset.
    if is_str(init_model):
        predictor = Predictor(model_file=init_model)
    elif isinstance(init_model, Booster):
        # Must be called on the given instance, not on the class.
        predictor = init_model.to_predictor()
    elif isinstance(init_model, Predictor):
        predictor = init_model
    else:
        predictor = None

    # Create the datasets.
    train_set = _construct_dataset(train_data[0], train_data[1], None,
                                   params, train_fields, predictor)
    is_valid_contain_train = False
    train_data_name = "training"
    valid_sets = []
    name_valid_sets = []
    if valid_datas is not None:
        for i, valid_data in enumerate(valid_datas):
            name = None if valid_names is None else valid_names[i]
            # Reduce cost: the training data is not constructed again, it
            # is evaluated through the booster's training set instead.
            if valid_data is train_data:
                is_valid_contain_train = True
                if name is not None:
                    train_data_name = name
                continue
            other_fields = None if valid_fields is None else valid_fields[i]
            valid_set = _construct_dataset(valid_data[0], valid_data[1],
                                           train_set, params,
                                           other_fields, predictor)
            valid_sets.append(valid_set)
            name_valid_sets.append(
                'valid_' + str(i) if name is None else name)

    # Process callbacks. Work on a copy so the caller's list is untouched.
    callbacks = [] if callbacks is None else list(callbacks)

    # Most of the legacy advanced options become callbacks.
    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation())
    elif isinstance(verbose_eval, int):
        callbacks.append(callback.print_evaluation(verbose_eval))

    if early_stopping_rounds is not None:
        callbacks.append(callback.early_stop(early_stopping_rounds,
                                             verbose=bool(verbose_eval)))
    if learning_rates is not None:
        callbacks.append(callback.reset_learning_rate(learning_rates))

    if out_eval_result is not None:
        callbacks.append(callback.record_evaluation(out_eval_result))

    # Callbacks flagged with a truthy `before_iteration` attribute run
    # before each update; all others run after evaluation.
    callbacks_before_iter = [
        cb for cb in callbacks if getattr(cb, 'before_iteration', False)]
    callbacks_after_iter = [
        cb for cb in callbacks if not getattr(cb, 'before_iteration', False)]

    # Construct the booster and register the evaluation sets.
    booster = Booster(params=params, train_set=train_set)
    if is_valid_contain_train:
        booster.set_train_data_name(train_data_name)
    for valid_set, name in zip(valid_sets, name_valid_sets):
        booster.add_valid(valid_set, name)

    # Start training.
    for i in range(num_boost_round):
        for cb in callbacks_before_iter:
            cb(callback.CallbackEnv(model=booster,
                                    cvfolds=None,
                                    iteration=i,
                                    begin_iteration=0,
                                    end_iteration=num_boost_round,
                                    evaluation_result_list=None))

        booster.update(fobj=fobj)

        # Collect evaluation results: the training set first (only when it
        # was requested through valid_datas), then the validation sets.
        evaluation_result_list = []
        if is_valid_contain_train:
            evaluation_result_list.extend(booster.eval_train(feval))
        if valid_sets:
            evaluation_result_list.extend(booster.eval_valid(feval))

        try:
            for cb in callbacks_after_iter:
                cb(callback.CallbackEnv(
                    model=booster,
                    cvfolds=None,
                    iteration=i,
                    begin_iteration=0,
                    end_iteration=num_boost_round,
                    evaluation_result_list=evaluation_result_list))
        except callback.EarlyStopException:
            # Raised by a callback to end training before num_boost_round.
            break

    if booster.attr('best_iteration') is not None:
        booster.best_iteration = int(booster.attr('best_iteration'))
    else:
        booster.best_iteration = num_boost_round - 1
    return booster