"...git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "336a77dfdecd68a528bc9b13a47e6ff93cca0ff6"
advanced_example.py 7.03 KB
Newer Older
# coding: utf-8
import json
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

import lightgbm as lgb
print('Loading data...')
# load or create your dataset
# The binary-classification example data lives next to this script's parent
# directory; each file is tab-separated with the label in column 0.
binary_example_dir = Path(__file__).absolute().parents[1] / 'binary_classification'
df_train = pd.read_csv(str(binary_example_dir / 'binary.train'), header=None, sep='\t')
df_test = pd.read_csv(str(binary_example_dir / 'binary.test'), header=None, sep='\t')
# per-row sample weights (single-column files)
W_train = pd.read_csv(str(binary_example_dir / 'binary.train.weight'), header=None)[0]
W_test = pd.read_csv(str(binary_example_dir / 'binary.test.weight'), header=None)[0]

# column 0 is the label; every remaining column is a feature
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

# Record the training matrix dimensions; num_feature drives feature naming below.
num_train, num_feature = X_train.shape

# Wrap the raw arrays in LightGBM Dataset objects.
# free_raw_data=False keeps the underlying arrays alive so these Datasets
# can be re-used by the several training runs later in this script.
lgb_train = lgb.Dataset(
    X_train,
    y_train,
    weight=W_train,
    free_raw_data=False,
)
lgb_eval = lgb.Dataset(
    X_test,
    y_test,
    reference=lgb_train,
    weight=W_test,
    free_raw_data=False,
)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,   # randomly select 90% of features per tree
    'bagging_fraction': 0.8,   # randomly select 80% of rows per bagging round
    'bagging_freq': 5,         # perform bagging every 5 iterations
    'verbose': 0
}

# generate feature names: feature_0, feature_1, ... feature_{num_feature-1}
feature_name = [f'feature_{col}' for col in range(num_feature)]

print('Starting training...')
# feature_name and categorical_feature
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,  # eval training data
                feature_name=feature_name,
                categorical_feature=[21])

print('Finished first 10 rounds...')
# check feature name
print(f'7th feature name is: {lgb_train.feature_name[6]}')

print('Saving model...')
# save model to file
gbm.save_model('model.txt')

print('Dumping model to JSON...')
# dump model to JSON (and save to file)
model_json = gbm.dump_model()

with open('model.json', 'w+') as f:
    json.dump(model_json, f, indent=4)

# feature names
print(f'Feature names: {gbm.feature_name()}')

# feature importances
print(f'Feature importances: {list(gbm.feature_importance())}')

print('Loading model to predict...')
# load model to predict
bst = lgb.Booster(model_file='model.txt')
# can only predict with the best iteration (or the saving iteration)
y_pred = bst.predict(X_test)
# eval with loaded model
rmse_loaded_model = mean_squared_error(y_test, y_pred) ** 0.5
print(f"The RMSE of loaded model's prediction is: {rmse_loaded_model}")

print('Dumping and loading model with pickle...')
# dump model with pickle
with open('model.pkl', 'wb') as fout:
    pickle.dump(gbm, fout)
# load model with pickle to predict
# NOTE: only unpickle files you trust -- pickle can execute arbitrary code.
with open('model.pkl', 'rb') as fin:
    pkl_bst = pickle.load(fin)
# can predict with any iteration when loaded in pickle way
y_pred = pkl_bst.predict(X_test, num_iteration=7)
# eval with loaded model
rmse_pickled_model = mean_squared_error(y_test, y_pred) ** 0.5
print(f"The RMSE of pickled model's prediction is: {rmse_pickled_model}")
# continue training
# init_model accepts:
# 1. model file name
# 2. Booster()
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model='model.txt',
                valid_sets=lgb_eval)

print('Finished 10 - 20 rounds with model file...')

# decay learning rates
# reset_parameter callback accepts:
# 1. list with length = num_boost_round
# 2. function(curr_iter)
# (lambda parameter renamed from `iter` to avoid shadowing the builtin)
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(learning_rate=lambda curr_iter: 0.05 * (0.99 ** curr_iter))])

print('Finished 20 - 30 rounds with decay learning rates...')
# change other parameters during training:
# bagging_fraction is 0.7 for the first 5 rounds, then 0.6 for the last 5
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])

print('Finished 30 - 40 rounds with changing bagging_fraction...')
# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelihood(preds, train_data):
    """Binary log-likelihood objective for lgb.train(fobj=...).

    preds are raw margin scores; they are mapped through the sigmoid,
    then the gradient and hessian of the log loss w.r.t. the margin
    are returned element-wise.
    """
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))  # sigmoid: margin -> probability
    grad = preds - labels               # d(loss)/d(margin)
    hess = preds * (1. - preds)         # d2(loss)/d(margin)2
    return grad, hess

# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: str, eval_result: float, is_higher_better: bool
# binary error
# NOTE: when you do customized loss function, the default prediction value is margin
# This may make built-in evaluation metric calculate wrong results
# For example, we are doing log likelihood loss, the prediction is score before logistic transformation
# Keep this in mind when you use the customization
def binary_error(preds, train_data):
    """Misclassification rate metric (lower is better) for lgb.train(feval=...)."""
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))  # sigmoid: margin -> probability
    # a prediction counts as class 1 when probability > 0.5
    return 'error', np.mean(labels != (preds > 0.5)), False

# train with the self-defined objective function and eval metric
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelihood,
                feval=binary_error,
                valid_sets=lgb_eval)

print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')

172
173

# another self-defined eval metric
174
# f(preds: array, train_data: Dataset) -> name: str, eval_result: float, is_higher_better: bool
175
# accuracy
176
# NOTE: when you do customized loss function, the default prediction value is margin
Andrew Ziem's avatar
Andrew Ziem committed
177
# This may make built-in evaluation metric calculate wrong results
178
179
# For example, we are doing log likelihood loss, the prediction is score before logistic transformation
# Keep this in mind when you use the customization
180
181
def accuracy(preds, train_data):
    labels = train_data.get_label()
182
    preds = 1. / (1. + np.exp(-preds))
183
184
185
186
187
188
189
190
    return 'accuracy', np.mean(labels == (preds > 0.5)), True


# train with the self-defined objective and a LIST of eval metrics
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelihood,
                feval=[binary_error, accuracy],
                valid_sets=lgb_eval)

print('Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...')

print('Starting a new training job...')


# callback
def reset_metrics():
    """Build a before-iteration callback that registers an extra
    validation set five iterations into training.

    Returns a callable suitable for lgb.train(callbacks=[...]).
    """
    def callback(env):
        # NOTE(review): a fresh Dataset is constructed on every iteration
        # even though it is only used once (at iteration 5) -- kept as-is
        # to preserve the original behavior.
        lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
        if env.iteration - env.begin_iteration == 5:
            print('Add a new valid dataset at iteration 5...')
            env.model.add_valid(lgb_eval_new, 'new_valid')
    # run before each boosting iteration, ahead of other callbacks
    callback.before_iteration = True
    callback.order = 0
    return callback

# train with the custom callback that adds a validation set mid-training
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,
                callbacks=[reset_metrics()])

print('Finished first 10 rounds with callback function...')