"src/git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "bbeecc09af946c5ff9b84d1ada4749a9f26bca31"
advanced_example.py 6.8 KB
Newer Older
1
# coding: utf-8
2
import copy
3
import json
4
import pickle
5
from pathlib import Path
6

7
import numpy as np
8
import pandas as pd
9
from sklearn.metrics import roc_auc_score
10

11
import lightgbm as lgb
12

13
print("Loading data...")
14
# load or create your dataset
15
16
17
18
19
binary_example_dir = Path(__file__).absolute().parents[1] / "binary_classification"
df_train = pd.read_csv(str(binary_example_dir / "binary.train"), header=None, sep="\t")
df_test = pd.read_csv(str(binary_example_dir / "binary.test"), header=None, sep="\t")
W_train = pd.read_csv(str(binary_example_dir / "binary.train.weight"), header=None)[0]
W_test = pd.read_csv(str(binary_example_dir / "binary.test.weight"), header=None)[0]
20

21
22
23
24
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
25
26
27
28
29

num_train, num_feature = X_train.shape

# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
30
31
lgb_train = lgb.Dataset(X_train, y_train, weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, weight=W_test, free_raw_data=False)
32
33
34

# specify your configurations as a dict
params = dict(
    boosting_type="gbdt",
    objective="binary",
    metric="binary_logloss",
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

46
# generate feature names
47
feature_name = [f"feature_{col}" for col in range(num_feature)]
48

49
print("Starting training...")
50
# feature_name and categorical_feature
51
52
53
54
55
56
57
58
59
60
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,
    valid_sets=lgb_train,  # eval training data
    feature_name=feature_name,
    categorical_feature=[21],
)

print("Finished first 10 rounds...")
# check feature name
# The Dataset remembers the feature names supplied to lgb.train above.
seventh_name = lgb_train.feature_name[6]
print(f"7th feature name is: {seventh_name}")

print("Saving model...")
# save model to file
gbm.save_model("model.txt")
67

68
print("Dumping model to JSON...")
Nikita Titov's avatar
Nikita Titov committed
69
# dump model to JSON (and save to file)
70
71
model_json = gbm.dump_model()

72
with open("model.json", "w+") as f:
73
74
75
    json.dump(model_json, f, indent=4)

# feature names
print(f"Feature names: {gbm.feature_name()}")

# feature importances
print(f"Feature importances: {list(gbm.feature_importance())}")

print("Loading model to predict...")
# load model to predict
bst = lgb.Booster(model_file="model.txt")
# can only predict with the best iteration (or the saving iteration)
y_pred = bst.predict(X_test)
# eval with loaded model
auc_loaded_model = roc_auc_score(y_test, y_pred)
print(f"The ROC AUC of loaded model's prediction is: {auc_loaded_model}")

print("Dumping and loading model with pickle...")
# dump model with pickle
with open("model.pkl", "wb") as pkl_out:
    pickle.dump(gbm, pkl_out)
# load model with pickle to predict
with open("model.pkl", "rb") as pkl_in:
    pkl_bst = pickle.load(pkl_in)
# can predict with any iteration when loaded in pickle way
y_pred = pkl_bst.predict(X_test, num_iteration=7)
# eval with loaded model
auc_pickled_model = roc_auc_score(y_test, y_pred)
print(f"The ROC AUC of pickled model's prediction is: {auc_pickled_model}")
102

103
104
105
106
# continue training
# init_model accepts:
# 1. model file name
# 2. Booster()
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,
    init_model="model.txt",  # resume from the booster saved earlier
    valid_sets=lgb_eval,
)

print("Finished 10 - 20 rounds with model file...")
110
111

# decay learning rates
# reset_parameter callback accepts:
# 1. list with length = num_boost_round
# 2. function(curr_iter)
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,
    init_model=gbm,
    valid_sets=lgb_eval,
    # Exponentially decay the learning rate each round.  The lambda parameter
    # is named ``round_idx`` (not ``iter``) to avoid shadowing the builtin.
    callbacks=[lgb.reset_parameter(learning_rate=lambda round_idx: 0.05 * (0.99**round_idx))],
)

print("Finished 20 - 30 rounds with decay learning rates...")
125

wxchan's avatar
wxchan committed
126
# change other parameters during training
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,
    init_model=gbm,
    valid_sets=lgb_eval,
    # bagging_fraction is 0.7 for the first 5 rounds, then 0.6 for the rest.
    callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)],
)

print("Finished 30 - 40 rounds with changing bagging_fraction...")
wxchan's avatar
wxchan committed
137

wxchan's avatar
wxchan committed
138

# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelihood(preds, train_data):
    """Binary log-loss objective: gradient and hessian w.r.t. raw scores."""
    labels = train_data.get_label()
    # Sigmoid maps raw margin scores to probabilities.
    prob = 1.0 / (1.0 + np.exp(-preds))
    return prob - labels, prob * (1.0 - prob)

wxchan's avatar
wxchan committed
149

150
# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: str, eval_result: float, is_higher_better: bool
# binary error
# NOTE: when you do customized loss function, the default prediction value is margin
# This may make built-in evaluation metric calculate wrong results
# For example, we are doing log likelihood loss, the prediction is score before logistic transformation
# Keep this in mind when you use the customization
def binary_error(preds, train_data):
    """Fraction of rows misclassified at a 0.5 probability threshold."""
    labels = train_data.get_label()
    prob = 1.0 / (1.0 + np.exp(-preds))
    predicted = prob > 0.5
    return "error", np.mean(labels != predicted), False
161

wxchan's avatar
wxchan committed
162

163
164
# Pass custom objective function through params
params_custom_obj = copy.deepcopy(params)
params_custom_obj["objective"] = loglikelihood

# Train with the custom objective plus the custom binary-error metric.
gbm = lgb.train(
    params_custom_obj,
    lgb_train,
    num_boost_round=10,
    init_model=gbm,
    feval=binary_error,
    valid_sets=lgb_eval,
)

print("Finished 40 - 50 rounds with self-defined objective function and eval metric...")
172

173
174

# another self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: str, eval_result: float, is_higher_better: bool
# accuracy
# NOTE: when you do customized loss function, the default prediction value is margin
# This may make built-in evaluation metric calculate wrong results
# For example, we are doing log likelihood loss, the prediction is score before logistic transformation
# Keep this in mind when you use the customization
def accuracy(preds, train_data):
    """Fraction of rows classified correctly at a 0.5 probability threshold."""
    labels = train_data.get_label()
    prob = 1.0 / (1.0 + np.exp(-preds))
    return "accuracy", np.mean(labels == (prob > 0.5)), True
185
186


187
188
# Pass custom objective function through params
params_custom_obj = {**copy.deepcopy(params), "objective": loglikelihood}

# Train again, now reporting two custom metrics at once.
gbm = lgb.train(
    params_custom_obj,
    lgb_train,
    num_boost_round=10,
    init_model=gbm,
    feval=[binary_error, accuracy],
    valid_sets=lgb_eval,
)

print("Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...")

print("Starting a new training job...")
wxchan's avatar
wxchan committed
203
204


205
206
207
208
209
# callback
def reset_metrics():
    """Build a callback that attaches a fresh validation set mid-training."""

    def _callback(env):
        # NOTE(review): a new Dataset is built on every iteration but only
        # attached once, at the 6th call — kept as-is to preserve behavior.
        lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
        if env.iteration - env.begin_iteration == 5:
            print("Add a new valid dataset at iteration 5...")
            env.model.add_valid(lgb_eval_new, "new_valid")

    # Run before each boosting iteration, ahead of other callbacks.
    _callback.before_iteration = True
    _callback.order = 0
    return _callback

wxchan's avatar
wxchan committed
217

218
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,
    valid_sets=lgb_train,
    callbacks=[reset_metrics()],
)

print("Finished first 10 rounds with callback function...")