Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
4a9b08ea
Unverified
Commit
4a9b08ea
authored
Aug 16, 2022
by
nyanp
Committed by
GitHub
Aug 15, 2022
Browse files
[python-package] support saving and loading CVBooster (fixes #3556) (#5160)
parent
6b695c29
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
204 additions
and
23 deletions
+204
-23
python-package/lightgbm/engine.py
python-package/lightgbm/engine.py
+119
-4
tests/python_package_test/test_callback.py
tests/python_package_test/test_callback.py
+1
-17
tests/python_package_test/test_engine.py
tests/python_package_test/test_engine.py
+66
-2
tests/python_package_test/utils.py
tests/python_package_test/utils.py
+18
-0
No files found.
python-package/lightgbm/engine.py
View file @
4a9b08ea
...
...
@@ -2,6 +2,7 @@
"""Library with training routines of LightGBM."""
import
collections
import
copy
import
json
from
operator
import
attrgetter
from
pathlib
import
Path
from
typing
import
Any
,
Callable
,
Dict
,
Iterable
,
List
,
Optional
,
Tuple
,
Union
...
...
@@ -271,9 +272,14 @@ def train(
class
CVBooster
:
"""CVBooster in LightGBM.
Auxiliary data structure to hold and redirect all boosters of ``cv`` function.
Auxiliary data structure to hold and redirect all boosters of ``cv
()
`` function.
This class has the same methods as Booster class.
All method calls are actually performed for underlying Boosters and then all returned results are returned in a list.
All method calls, except for the following methods, are actually performed for underlying Boosters and
then all returned results are returned in a list.
- ``model_from_string()``
- ``model_to_string()``
- ``save_model()``
Attributes
----------
...
...
@@ -283,18 +289,43 @@ class CVBooster:
The best iteration of fitted model.
"""
def __init__(self, model_file: Optional[Union[str, Path]] = None):
    """Initialize the CVBooster.

    Generally, no need to instantiate manually.

    Parameters
    ----------
    model_file : str, pathlib.Path or None, optional (default=None)
        Path to the CVBooster model file.
    """
    # List of underlying per-fold Boosters; populated by ``_append()``.
    self.boosters = []
    # -1 means "no best iteration recorded".
    self.best_iteration = -1
    if model_file is not None:
        # The model file is JSON (written by ``save_model()``), so decode it
        # as UTF-8 explicitly instead of relying on the locale default,
        # which can break on non-UTF-8 platforms.
        with open(model_file, "r", encoding="utf-8") as file:
            self._from_dict(json.load(file))
def _append(self, booster: Booster) -> None:
    """Add a booster to CVBooster."""
    self.boosters.append(booster)
def _from_dict(self, models: Dict[str, Any]) -> None:
    """Load CVBooster from dict."""
    self.best_iteration = models["best_iteration"]
    self.boosters = []
    # Rebuild one Booster per serialized model string, in order.
    for serialized_model in models["boosters"]:
        self._append(Booster(model_str=serialized_model))
def _to_dict(self, num_iteration: Optional[int], start_iteration: int, importance_type: str) -> Dict[str, Any]:
    """Serialize CVBooster to dict."""
    # Serialize every underlying fold model; forwarding the iteration and
    # importance settings to each Booster unchanged.
    models_str = [
        bst.model_to_string(
            num_iteration=num_iteration,
            start_iteration=start_iteration,
            importance_type=importance_type,
        )
        for bst in self.boosters
    ]
    return {"boosters": models_str, "best_iteration": self.best_iteration}
def
__getattr__
(
self
,
name
:
str
)
->
Callable
[[
Any
,
Any
],
List
[
Any
]]:
"""Redirect methods call of CVBooster."""
def
handler_function
(
*
args
:
Any
,
**
kwargs
:
Any
)
->
List
[
Any
]:
...
...
@@ -305,6 +336,90 @@ class CVBooster:
return
ret
return
handler_function
def __getstate__(self) -> Dict[str, Any]:
    """Return the attribute dict used by pickle to capture this object's state."""
    return self.__dict__
def __setstate__(self, state: Dict[str, Any]) -> None:
    """Restore the pickled attribute dict onto this instance."""
    self.__dict__.update(state)
def model_from_string(self, model_str: str) -> "CVBooster":
    """Load CVBooster from a string.

    Parameters
    ----------
    model_str : str
        Model will be loaded from this string.

    Returns
    -------
    self : CVBooster
        Loaded CVBooster object.
    """
    parsed = json.loads(model_str)
    self._from_dict(parsed)
    return self
def model_to_string(self, num_iteration: Optional[int] = None, start_iteration: int = 0, importance_type: str = 'split') -> str:
    """Save CVBooster to JSON string.

    Parameters
    ----------
    num_iteration : int or None, optional (default=None)
        Index of the iteration that should be saved.
        If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
        If <= 0, all iterations are saved.
    start_iteration : int, optional (default=0)
        Start index of the iteration that should be saved.
    importance_type : str, optional (default="split")
        What type of feature importance should be saved.
        If "split", result contains numbers of times the feature is used in a model.
        If "gain", result contains total gains of splits which use the feature.

    Returns
    -------
    str_repr : str
        JSON string representation of CVBooster.
    """
    state = self._to_dict(num_iteration, start_iteration, importance_type)
    return json.dumps(state)
def save_model(self, filename: Union[str, Path], num_iteration: Optional[int] = None, start_iteration: int = 0, importance_type: str = 'split') -> "CVBooster":
    """Save CVBooster to a file as JSON text.

    Parameters
    ----------
    filename : str or pathlib.Path
        Filename to save CVBooster.
    num_iteration : int or None, optional (default=None)
        Index of the iteration that should be saved.
        If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
        If <= 0, all iterations are saved.
    start_iteration : int, optional (default=0)
        Start index of the iteration that should be saved.
    importance_type : str, optional (default="split")
        What type of feature importance should be saved.
        If "split", result contains numbers of times the feature is used in a model.
        If "gain", result contains total gains of splits which use the feature.

    Returns
    -------
    self : CVBooster
        Returns self.
    """
    # Write the JSON representation with an explicit UTF-8 encoding so the
    # file can always be read back by ``__init__(model_file=...)``, which
    # also decodes as UTF-8, regardless of the platform's locale default.
    with open(filename, "w", encoding="utf-8") as file:
        json.dump(self._to_dict(num_iteration, start_iteration, importance_type), file)
    return self
def
_make_n_folds
(
full_data
:
Dataset
,
...
...
tests/python_package_test/test_callback.py
View file @
4a9b08ea
...
...
@@ -3,23 +3,7 @@ import pytest
import
lightgbm
as
lgb
from
.utils
import
pickle_obj
,
unpickle_obj
SERIALIZERS
=
[
"pickle"
,
"joblib"
,
"cloudpickle"
]
def pickle_and_unpickle_object(obj, serializer):
    """Round-trip ``obj`` through a temp file with the given serializer and return the reloaded copy."""
    with lgb.basic._TempFile() as tmp_file:
        pickle_obj(
            obj=obj,
            filepath=tmp_file.name,
            serializer=serializer
        )
        obj_from_disk = unpickle_obj(
            filepath=tmp_file.name,
            serializer=serializer
        )
    return obj_from_disk
from
.utils
import
SERIALIZERS
,
pickle_and_unpickle_object
,
pickle_obj
,
unpickle_obj
def
reset_feature_fraction
(
boosting_round
):
...
...
tests/python_package_test/test_engine.py
View file @
4a9b08ea
...
...
@@ -20,8 +20,9 @@ from sklearn.model_selection import GroupKFold, TimeSeriesSplit, train_test_spli
import
lightgbm
as
lgb
from
lightgbm.compat
import
PANDAS_INSTALLED
,
pd_DataFrame
from
.utils
import
(
dummy_obj
,
load_boston
,
load_breast_cancer
,
load_digits
,
load_iris
,
logistic_sigmoid
,
make_synthetic_regression
,
mse_obj
,
sklearn_multiclass_custom_objective
,
softmax
)
from
.utils
import
(
SERIALIZERS
,
dummy_obj
,
load_boston
,
load_breast_cancer
,
load_digits
,
load_iris
,
logistic_sigmoid
,
make_synthetic_regression
,
mse_obj
,
pickle_and_unpickle_object
,
sklearn_multiclass_custom_objective
,
softmax
)
decreasing_generator
=
itertools
.
count
(
0
,
-
1
)
...
...
@@ -1073,6 +1074,69 @@ def test_cvbooster():
assert
ret
<
0.15
def test_cvbooster_save_load(tmp_path):
    """A CVBooster reloaded from a file or a string must reproduce the original's predictions."""
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': -1,
    }
    nfold = 3
    lgb_train = lgb.Dataset(X_train, y_train)
    cv_res = lgb.cv(params, lgb_train, num_boost_round=10, nfold=nfold,
                    callbacks=[lgb.early_stopping(stopping_rounds=5)],
                    return_cvbooster=True)

    cvbooster = cv_res['cvbooster']
    preds = cvbooster.predict(X_test)
    best_iteration = cvbooster.best_iteration

    # Persist via both supported serialization paths, then drop the original.
    model_path_txt = str(tmp_path / 'lgb.model')
    cvbooster.save_model(model_path_txt)
    model_string = cvbooster.model_to_string()
    del cvbooster

    loaded_from_file = lgb.CVBooster(model_file=model_path_txt)
    loaded_from_string = lgb.CVBooster().model_from_string(model_string)

    for reloaded in [loaded_from_file, loaded_from_string]:
        assert best_iteration == reloaded.best_iteration
        np.testing.assert_array_equal(preds, reloaded.predict(X_test))
@pytest.mark.parametrize('serializer', SERIALIZERS)
def test_cvbooster_picklable(serializer):
    """A pickled-and-unpickled CVBooster must keep its best iteration and predictions."""
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': -1,
    }
    nfold = 3
    lgb_train = lgb.Dataset(X_train, y_train)
    cv_res = lgb.cv(params, lgb_train, num_boost_round=10, nfold=nfold,
                    callbacks=[lgb.early_stopping(stopping_rounds=5)],
                    return_cvbooster=True)

    cvbooster = cv_res['cvbooster']
    preds = cvbooster.predict(X_test)
    best_iteration = cvbooster.best_iteration

    # Round-trip through the parametrized serializer and drop the original.
    cvbooster_from_disk = pickle_and_unpickle_object(obj=cvbooster, serializer=serializer)
    del cvbooster

    assert best_iteration == cvbooster_from_disk.best_iteration
    preds_from_disk = cvbooster_from_disk.predict(X_test)
    np.testing.assert_array_equal(preds, preds_from_disk)
def
test_feature_name
():
X_train
,
y_train
=
make_synthetic_regression
()
params
=
{
'verbose'
:
-
1
}
...
...
tests/python_package_test/utils.py
View file @
4a9b08ea
...
...
@@ -8,6 +8,10 @@ import numpy as np
import
sklearn.datasets
from
sklearn.utils
import
check_random_state
import
lightgbm
as
lgb
# Serializer backends exercised by pickling round-trip tests.
SERIALIZERS = ["pickle", "joblib", "cloudpickle"]
@
lru_cache
(
maxsize
=
None
)
def
load_boston
(
**
kwargs
):
...
...
@@ -179,3 +183,17 @@ def unpickle_obj(filepath, serializer):
return
cloudpickle
.
load
(
f
)
else
:
raise
ValueError
(
f
'Unrecognized serializer type:
{
serializer
}
'
)
def pickle_and_unpickle_object(obj, serializer):
    """Serialize ``obj`` to a temporary file and return the deserialized copy."""
    with lgb.basic._TempFile() as tmp_file:
        pickle_obj(
            obj=obj,
            filepath=tmp_file.name,
            serializer=serializer
        )
        obj_from_disk = unpickle_obj(
            filepath=tmp_file.name,
            serializer=serializer
        )
    return obj_from_disk
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment