Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
3ad9cba0
Commit
3ad9cba0
authored
Oct 23, 2018
by
Nikita Titov
Committed by
Tsukasa OMOTO
Oct 23, 2018
Browse files
[python] refined examples (#1769)
parent
0312ecde
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
58 additions
and
58 deletions
+58
-58
examples/python-guide/README.md
examples/python-guide/README.md
+4
-2
examples/python-guide/advanced_example.py
examples/python-guide/advanced_example.py
+24
-22
examples/python-guide/plot_example.py
examples/python-guide/plot_example.py
+11
-11
examples/python-guide/simple_example.py
examples/python-guide/simple_example.py
+9
-11
examples/python-guide/sklearn_example.py
examples/python-guide/sklearn_example.py
+10
-12
No files found.
examples/python-guide/README.md
View file @
3ad9cba0
...
...
@@ -32,11 +32,13 @@ Examples include:
-
Self-defined eval metric with sklearn interface
-
Find best parameters for the model with sklearn's GridSearchCV
-
[
advanced_example.py
](
https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py
)
-
Construct Dataset
-
Set feature names
-
Directly use categorical features without one-hot encoding
-
Dump
model to
json format
-
Get feature importances
-
Save
model to
file
-
Dump model to JSON format
-
Get feature names
-
Get feature importances
-
Load model to predict
-
Dump and load model with pickle
-
Load model file to continue training
...
...
examples/python-guide/advanced_example.py
View file @
3ad9cba0
...
...
@@ -11,17 +11,17 @@ try:
except
BaseException
:
import
pickle
print
(
'Loading data...'
)
# load or create your dataset
print
(
'Load data...'
)
df_train
=
pd
.
read_csv
(
'../binary_classification/binary.train'
,
header
=
None
,
sep
=
'
\t
'
)
df_test
=
pd
.
read_csv
(
'../binary_classification/binary.test'
,
header
=
None
,
sep
=
'
\t
'
)
W_train
=
pd
.
read_csv
(
'../binary_classification/binary.train.weight'
,
header
=
None
)[
0
]
W_test
=
pd
.
read_csv
(
'../binary_classification/binary.test.weight'
,
header
=
None
)[
0
]
y_train
=
df_train
[
0
]
.
values
y_test
=
df_test
[
0
]
.
values
X_train
=
df_train
.
drop
(
0
,
axis
=
1
)
.
values
X_test
=
df_test
.
drop
(
0
,
axis
=
1
)
.
values
y_train
=
df_train
[
0
]
y_test
=
df_test
[
0
]
X_train
=
df_train
.
drop
(
0
,
axis
=
1
)
X_test
=
df_test
.
drop
(
0
,
axis
=
1
)
num_train
,
num_feature
=
X_train
.
shape
...
...
@@ -45,10 +45,10 @@ params = {
'verbose'
:
0
}
# generate
a
feature name
# generate feature name
s
feature_name
=
[
'feature_'
+
str
(
col
)
for
col
in
range
(
num_feature
)]
print
(
'Start training...'
)
print
(
'Start
ing
training...'
)
# feature_name and categorical_feature
gbm
=
lgb
.
train
(
params
,
lgb_train
,
...
...
@@ -57,15 +57,16 @@ gbm = lgb.train(params,
feature_name
=
feature_name
,
categorical_feature
=
[
21
])
print
(
'Finished first 10 rounds...'
)
# check feature name
print
(
'Finish first 10 rounds...'
)
print
(
'7th feature name is:'
,
repr
(
lgb_train
.
feature_name
[
6
]))
print
(
'7th feature name is:'
,
lgb_train
.
feature_name
[
6
])
print
(
'Saving model...'
)
# save model to file
gbm
.
save_model
(
'model.txt'
)
print
(
'Dumping model to JSON...'
)
# dump model to JSON (and save to file)
print
(
'Dump model to JSON...'
)
model_json
=
gbm
.
dump_model
()
with
open
(
'model.json'
,
'w+'
)
as
f
:
...
...
@@ -77,14 +78,15 @@ print('Feature names:', gbm.feature_name())
# feature importances
print
(
'Feature importances:'
,
list
(
gbm
.
feature_importance
()))
print
(
'Loading model to predict...'
)
# load model to predict
print
(
'Load model to predict'
)
bst
=
lgb
.
Booster
(
model_file
=
'model.txt'
)
# can only predict with the best iteration (or the saving iteration)
y_pred
=
bst
.
predict
(
X_test
)
# eval with loaded model
print
(
'
The rmse of loaded model
\
'
s prediction is:
'
,
mean_squared_error
(
y_test
,
y_pred
)
**
0.5
)
print
(
"
The rmse of loaded model's prediction is:
"
,
mean_squared_error
(
y_test
,
y_pred
)
**
0.5
)
print
(
'Dumping and loading model with pickle...'
)
# dump model with pickle
with
open
(
'model.pkl'
,
'wb'
)
as
fout
:
pickle
.
dump
(
gbm
,
fout
)
...
...
@@ -94,7 +96,7 @@ with open('model.pkl', 'rb') as fin:
# can predict with any iteration when loaded in pickle way
y_pred
=
pkl_bst
.
predict
(
X_test
,
num_iteration
=
7
)
# eval with loaded model
print
(
'
The rmse of pickled model
\
'
s prediction is:
'
,
mean_squared_error
(
y_test
,
y_pred
)
**
0.5
)
print
(
"
The rmse of pickled model's prediction is:
"
,
mean_squared_error
(
y_test
,
y_pred
)
**
0.5
)
# continue training
# init_model accepts:
...
...
@@ -106,7 +108,7 @@ gbm = lgb.train(params,
init_model
=
'model.txt'
,
valid_sets
=
lgb_eval
)
print
(
'Finish 10 - 20 rounds with model file...'
)
print
(
'Finish
ed
10 - 20 rounds with model file...'
)
# decay learning rates
# learning_rates accepts:
...
...
@@ -119,7 +121,7 @@ gbm = lgb.train(params,
learning_rates
=
lambda
iter
:
0.05
*
(
0.99
**
iter
),
valid_sets
=
lgb_eval
)
print
(
'Finish 20 - 30 rounds with decay learning rates...'
)
print
(
'Finish
ed
20 - 30 rounds with decay learning rates...'
)
# change other parameters during training
gbm
=
lgb
.
train
(
params
,
...
...
@@ -129,13 +131,13 @@ gbm = lgb.train(params,
valid_sets
=
lgb_eval
,
callbacks
=
[
lgb
.
reset_parameter
(
bagging_fraction
=
[
0.7
]
*
5
+
[
0.6
]
*
5
)])
print
(
'Finish 30 - 40 rounds with changing bagging_fraction...'
)
print
(
'Finish
ed
30 - 40 rounds with changing bagging_fraction...'
)
# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def
loglikelood
(
preds
,
train_data
):
def
loglikel
ih
ood
(
preds
,
train_data
):
labels
=
train_data
.
get_label
()
preds
=
1.
/
(
1.
+
np
.
exp
(
-
preds
))
grad
=
preds
-
labels
...
...
@@ -155,13 +157,13 @@ gbm = lgb.train(params,
lgb_train
,
num_boost_round
=
10
,
init_model
=
gbm
,
fobj
=
loglikelood
,
fobj
=
loglikel
ih
ood
,
feval
=
binary_error
,
valid_sets
=
lgb_eval
)
print
(
'Finish 40 - 50 rounds with self-defined objective function and eval metric...'
)
print
(
'Finish
ed
40 - 50 rounds with self-defined objective function and eval metric...'
)
print
(
'Start a new training job...'
)
print
(
'Start
ing
a new training job...'
)
# callback
...
...
@@ -170,7 +172,7 @@ def reset_metrics():
lgb_eval_new
=
lgb
.
Dataset
(
X_test
,
y_test
,
reference
=
lgb_train
)
if
env
.
iteration
-
env
.
begin_iteration
==
5
:
print
(
'Add a new valid dataset at iteration 5...'
)
env
.
model
.
add_valid
(
lgb_eval_new
,
'new
valid'
)
env
.
model
.
add_valid
(
lgb_eval_new
,
'new
_
valid'
)
callback
.
before_iteration
=
True
callback
.
order
=
0
return
callback
...
...
@@ -182,4 +184,4 @@ gbm = lgb.train(params,
valid_sets
=
lgb_train
,
callbacks
=
[
reset_metrics
()])
print
(
'Finish first 10 rounds with callback function...'
)
print
(
'Finish
ed
first 10 rounds with callback function...'
)
examples/python-guide/plot_example.py
View file @
3ad9cba0
...
...
@@ -8,15 +8,15 @@ if lgb.compat.MATPLOTLIB_INSTALLED:
else
:
raise
ImportError
(
'You need to install matplotlib for plot_example.py.'
)
print
(
'Loading data...'
)
# load or create your dataset
print
(
'Load data...'
)
df_train
=
pd
.
read_csv
(
'../regression/regression.train'
,
header
=
None
,
sep
=
'
\t
'
)
df_test
=
pd
.
read_csv
(
'../regression/regression.test'
,
header
=
None
,
sep
=
'
\t
'
)
y_train
=
df_train
[
0
]
.
values
y_test
=
df_test
[
0
]
.
values
X_train
=
df_train
.
drop
(
0
,
axis
=
1
)
.
values
X_test
=
df_test
.
drop
(
0
,
axis
=
1
)
.
values
y_train
=
df_train
[
0
]
y_test
=
df_test
[
0
]
X_train
=
df_train
.
drop
(
0
,
axis
=
1
)
X_test
=
df_test
.
drop
(
0
,
axis
=
1
)
# create dataset for lightgbm
lgb_train
=
lgb
.
Dataset
(
X_train
,
y_train
)
...
...
@@ -31,29 +31,29 @@ params = {
evals_result
=
{}
# to record eval results for plotting
print
(
'Start training...'
)
print
(
'Start
ing
training...'
)
# train
gbm
=
lgb
.
train
(
params
,
lgb_train
,
num_boost_round
=
100
,
valid_sets
=
[
lgb_train
,
lgb_test
],
feature_name
=
[
'f'
+
str
(
i
+
1
)
for
i
in
range
(
28
)],
feature_name
=
[
'f'
+
str
(
i
+
1
)
for
i
in
range
(
X_train
.
shape
[
-
1
]
)],
categorical_feature
=
[
21
],
evals_result
=
evals_result
,
verbose_eval
=
10
)
print
(
'Plot metrics recorded during training...'
)
print
(
'Plot
ting
metrics recorded during training...'
)
ax
=
lgb
.
plot_metric
(
evals_result
,
metric
=
'l1'
)
plt
.
show
()
print
(
'Plot feature importances...'
)
print
(
'Plot
ting
feature importances...'
)
ax
=
lgb
.
plot_importance
(
gbm
,
max_num_features
=
10
)
plt
.
show
()
print
(
'Plot 84th tree...'
)
# one tree use categorical feature to split
print
(
'Plot
ting
84th tree...'
)
# one tree use categorical feature to split
ax
=
lgb
.
plot_tree
(
gbm
,
tree_index
=
83
,
figsize
=
(
20
,
8
),
show_info
=
[
'split_gain'
])
plt
.
show
()
print
(
'Plot 84th tree with graphviz...'
)
print
(
'Plot
ting
84th tree with graphviz...'
)
graph
=
lgb
.
create_tree_digraph
(
gbm
,
tree_index
=
83
,
name
=
'Tree84'
)
graph
.
render
(
view
=
True
)
examples/python-guide/simple_example.py
View file @
3ad9cba0
...
...
@@ -4,16 +4,15 @@ import lightgbm as lgb
import
pandas
as
pd
from
sklearn.metrics
import
mean_squared_error
print
(
'Loading data...'
)
# load or create your dataset
print
(
'Load data...'
)
df_train
=
pd
.
read_csv
(
'../regression/regression.train'
,
header
=
None
,
sep
=
'
\t
'
)
df_test
=
pd
.
read_csv
(
'../regression/regression.test'
,
header
=
None
,
sep
=
'
\t
'
)
y_train
=
df_train
[
0
]
.
values
y_test
=
df_test
[
0
]
.
values
X_train
=
df_train
.
drop
(
0
,
axis
=
1
)
.
values
X_test
=
df_test
.
drop
(
0
,
axis
=
1
)
.
values
y_train
=
df_train
[
0
]
y_test
=
df_test
[
0
]
X_train
=
df_train
.
drop
(
0
,
axis
=
1
)
X_test
=
df_test
.
drop
(
0
,
axis
=
1
)
# create dataset for lightgbm
lgb_train
=
lgb
.
Dataset
(
X_train
,
y_train
)
...
...
@@ -21,10 +20,9 @@ lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict
params
=
{
'task'
:
'train'
,
'boosting_type'
:
'gbdt'
,
'objective'
:
'regression'
,
'metric'
:
{
'l2'
,
'
auc
'
},
'metric'
:
{
'l2'
,
'
l1
'
},
'num_leaves'
:
31
,
'learning_rate'
:
0.05
,
'feature_fraction'
:
0.9
,
...
...
@@ -33,7 +31,7 @@ params = {
'verbose'
:
0
}
print
(
'Start training...'
)
print
(
'Start
ing
training...'
)
# train
gbm
=
lgb
.
train
(
params
,
lgb_train
,
...
...
@@ -41,11 +39,11 @@ gbm = lgb.train(params,
valid_sets
=
lgb_eval
,
early_stopping_rounds
=
5
)
print
(
'Sav
e
model...'
)
print
(
'Sav
ing
model...'
)
# save model to file
gbm
.
save_model
(
'model.txt'
)
print
(
'Start predicting...'
)
print
(
'Start
ing
predicting...'
)
# predict
y_pred
=
gbm
.
predict
(
X_test
,
num_iteration
=
gbm
.
best_iteration
)
# eval
...
...
examples/python-guide/sklearn_example.py
View file @
3ad9cba0
...
...
@@ -7,20 +7,19 @@ import lightgbm as lgb
from
sklearn.metrics
import
mean_squared_error
from
sklearn.model_selection
import
GridSearchCV
print
(
'Loading data...'
)
# load or create your dataset
print
(
'Load data...'
)
df_train
=
pd
.
read_csv
(
'../regression/regression.train'
,
header
=
None
,
sep
=
'
\t
'
)
df_test
=
pd
.
read_csv
(
'../regression/regression.test'
,
header
=
None
,
sep
=
'
\t
'
)
y_train
=
df_train
[
0
]
.
values
y_test
=
df_test
[
0
]
.
values
X_train
=
df_train
.
drop
(
0
,
axis
=
1
)
.
values
X_test
=
df_test
.
drop
(
0
,
axis
=
1
)
.
values
y_train
=
df_train
[
0
]
y_test
=
df_test
[
0
]
X_train
=
df_train
.
drop
(
0
,
axis
=
1
)
X_test
=
df_test
.
drop
(
0
,
axis
=
1
)
print
(
'Start training...'
)
print
(
'Start
ing
training...'
)
# train
gbm
=
lgb
.
LGBMRegressor
(
objective
=
'regression'
,
num_leaves
=
31
,
gbm
=
lgb
.
LGBMRegressor
(
num_leaves
=
31
,
learning_rate
=
0.05
,
n_estimators
=
20
)
gbm
.
fit
(
X_train
,
y_train
,
...
...
@@ -28,7 +27,7 @@ gbm.fit(X_train, y_train,
eval_metric
=
'l1'
,
early_stopping_rounds
=
5
)
print
(
'Start predicting...'
)
print
(
'Start
ing
predicting...'
)
# predict
y_pred
=
gbm
.
predict
(
X_test
,
num_iteration
=
gbm
.
best_iteration_
)
# eval
...
...
@@ -45,14 +44,14 @@ def rmsle(y_true, y_pred):
return
'RMSLE'
,
np
.
sqrt
(
np
.
mean
(
np
.
power
(
np
.
log1p
(
y_pred
)
-
np
.
log1p
(
y_true
),
2
))),
False
print
(
'Start training with custom eval function...'
)
print
(
'Start
ing
training with custom eval function...'
)
# train
gbm
.
fit
(
X_train
,
y_train
,
eval_set
=
[(
X_test
,
y_test
)],
eval_metric
=
rmsle
,
early_stopping_rounds
=
5
)
print
(
'Start predicting...'
)
print
(
'Start
ing
predicting...'
)
# predict
y_pred
=
gbm
.
predict
(
X_test
,
num_iteration
=
gbm
.
best_iteration_
)
# eval
...
...
@@ -67,7 +66,6 @@ param_grid = {
}
gbm
=
GridSearchCV
(
estimator
,
param_grid
,
cv
=
3
)
gbm
.
fit
(
X_train
,
y_train
)
print
(
'Best parameters found by grid search are:'
,
gbm
.
best_params_
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment