# coding: utf-8
# pylint: disable = invalid-name, C0111
import lightgbm as lgb
import pandas as pd
import numpy as np

# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../binary_classification/binary.train', header=None, sep='\t')
df_test = pd.read_csv('../binary_classification/binary.test', header=None, sep='\t')
W_train = pd.read_csv('../binary_classification/binary.train.weight', header=None)[0]
W_test = pd.read_csv('../binary_classification/binary.test.weight', header=None)[0]

y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

num_train, num_feature = X_train.shape

# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train,
                        weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
                       weight=W_test, free_raw_data=False)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# generate feature names
feature_name = ['feature_' + str(col) for col in range(num_feature)]

print('Start training...')
# feature_name and categorical_feature
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,  # eval training data
                feature_name=feature_name,
                categorical_feature=[21])

# check feature name
print('Finish first 10 rounds...')
print('7th feature name is:', repr(lgb_train.feature_name[6]))

# save model to file
gbm.save_model('model.txt')

# continue training
# init_model accepts:
# 1. model file name
# 2. Booster()
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model='model.txt',
                valid_sets=lgb_eval)

print('Finish 10 - 20 rounds with model file...')

# decay learning rates
# learning_rates accepts:
# 1. list/tuple with length = num_boost_round
# 2. function(curr_iter)
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                learning_rates=lambda iter: 0.05 * (0.99 ** iter),
                valid_sets=lgb_eval)

print('Finish 20 - 30 rounds with decay learning rates...')

# change other parameters during training
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])

print('Finish 30 - 40 rounds with changing bagging_fraction...')


# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelihood(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1. - preds)
    return grad, hess
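
# (optional) sanity-check the custom objective above; this block is an added
# sketch, not required for training: it compares the analytic gradient of the
# log loss (sigmoid(z) - y) against a central finite-difference estimate at a
# few random raw scores, using plain NumPy only
_rng = np.random.RandomState(0)
_z = _rng.randn(5)                        # random raw scores
_y = (_rng.rand(5) > 0.5).astype(float)   # random 0/1 labels
_eps = 1e-6


def _logloss(z):
    p = 1. / (1. + np.exp(-z))
    return -(_y * np.log(p) + (1. - _y) * np.log(1. - p))


_num_grad = (_logloss(_z + _eps) - _logloss(_z - _eps)) / (2. * _eps)
_ana_grad = 1. / (1. + np.exp(-_z)) - _y  # same formula as grad in loglikelihood
print('Max gradient check error:', np.max(np.abs(_num_grad - _ana_grad)))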

# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, eval_result: float, is_higher_better: bool
# binary error
# NOTE: with a customized objective (fobj), the preds passed to feval are raw
# scores, so apply the sigmoid before thresholding at 0.5
def binary_error(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    return 'error', np.mean(labels != (preds > 0.5)), False


gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelihood,
                feval=binary_error,
                valid_sets=lgb_eval)

print('Finish 40 - 50 rounds with self-defined objective function and eval metric...')

print('Start a new training job...')


# callback
def reset_metrics():
    def callback(env):
        lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
        if env.iteration - env.begin_iteration == 5:
            print('Add a new valid dataset at iteration 5...')
            env.model.add_valid(lgb_eval_new, 'new valid')
    callback.before_iteration = True
    callback.order = 0
    return callback


gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,
                callbacks=[reset_metrics()])

print('Finish first 10 rounds with callback function...')
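
# (optional) a minimal added sketch showing how to load a saved model back for
# prediction; 'model.txt' holds the 10-round model saved earlier, and
# lgb.Booster(model_file=...) / Booster.predict are standard LightGBM APIs
bst = lgb.Booster(model_file='model.txt')
y_pred = bst.predict(X_test)  # probabilities, since the objective is 'binary'
print('Prediction error of the loaded 10-round model:',
      np.mean(y_test != (y_pred > 0.5)))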