Unverified Commit 4a9b08ea authored by nyanp's avatar nyanp Committed by GitHub
Browse files

[python-package] support saving and loading CVBooster (fixes #3556) (#5160)

parent 6b695c29
......@@ -2,6 +2,7 @@
"""Library with training routines of LightGBM."""
import collections
import copy
import json
from operator import attrgetter
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
......@@ -271,9 +272,14 @@ def train(
class CVBooster:
"""CVBooster in LightGBM.
Auxiliary data structure to hold and redirect all boosters of ``cv`` function.
Auxiliary data structure to hold and redirect all boosters of ``cv()`` function.
This class has the same methods as Booster class.
All method calls are actually performed for underlying Boosters and then all returned results are returned in a list.
All method calls, except for the following methods, are actually performed for underlying Boosters and
then all returned results are returned in a list.
- ``model_from_string()``
- ``model_to_string()``
- ``save_model()``
Attributes
----------
......@@ -283,18 +289,43 @@ class CVBooster:
The best iteration of fitted model.
"""
def __init__(self):
def __init__(
self,
model_file: Optional[Union[str, Path]] = None
):
"""Initialize the CVBooster.
Generally, no need to instantiate manually.
Parameters
----------
model_file : str, pathlib.Path or None, optional (default=None)
Path to the CVBooster model file.
"""
self.boosters = []
self.best_iteration = -1
if model_file is not None:
with open(model_file, "r") as file:
self._from_dict(json.load(file))
def _append(self, booster: Booster) -> None:
"""Add a booster to CVBooster."""
self.boosters.append(booster)
def _from_dict(self, models: Dict[str, Any]) -> None:
"""Load CVBooster from dict."""
self.best_iteration = models["best_iteration"]
self.boosters = []
for model_str in models["boosters"]:
self._append(Booster(model_str=model_str))
def _to_dict(self, num_iteration: Optional[int], start_iteration: int, importance_type: str) -> Dict[str, Any]:
"""Serialize CVBooster to dict."""
models_str = []
for booster in self.boosters:
models_str.append(booster.model_to_string(num_iteration=num_iteration, start_iteration=start_iteration,
importance_type=importance_type))
return {"boosters": models_str, "best_iteration": self.best_iteration}
def __getattr__(self, name: str) -> Callable[[Any, Any], List[Any]]:
"""Redirect methods call of CVBooster."""
def handler_function(*args: Any, **kwargs: Any) -> List[Any]:
......@@ -305,6 +336,90 @@ class CVBooster:
return ret
return handler_function
def __getstate__(self) -> Dict[str, Any]:
return vars(self)
def __setstate__(self, state: Dict[str, Any]) -> None:
vars(self).update(state)
def model_from_string(self, model_str: str) -> "CVBooster":
"""Load CVBooster from a string.
Parameters
----------
model_str : str
Model will be loaded from this string.
Returns
-------
self : CVBooster
Loaded CVBooster object.
"""
self._from_dict(json.loads(model_str))
return self
def model_to_string(
self,
num_iteration: Optional[int] = None,
start_iteration: int = 0,
importance_type: str = 'split'
) -> str:
"""Save CVBooster to JSON string.
Parameters
----------
num_iteration : int or None, optional (default=None)
Index of the iteration that should be saved.
If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
If <= 0, all iterations are saved.
start_iteration : int, optional (default=0)
Start index of the iteration that should be saved.
importance_type : str, optional (default="split")
What type of feature importance should be saved.
If "split", result contains numbers of times the feature is used in a model.
If "gain", result contains total gains of splits which use the feature.
Returns
-------
str_repr : str
JSON string representation of CVBooster.
"""
return json.dumps(self._to_dict(num_iteration, start_iteration, importance_type))
def save_model(
self,
filename: Union[str, Path],
num_iteration: Optional[int] = None,
start_iteration: int = 0,
importance_type: str = 'split'
) -> "CVBooster":
"""Save CVBooster to a file as JSON text.
Parameters
----------
filename : str or pathlib.Path
Filename to save CVBooster.
num_iteration : int or None, optional (default=None)
Index of the iteration that should be saved.
If None, if the best iteration exists, it is saved; otherwise, all iterations are saved.
If <= 0, all iterations are saved.
start_iteration : int, optional (default=0)
Start index of the iteration that should be saved.
importance_type : str, optional (default="split")
What type of feature importance should be saved.
If "split", result contains numbers of times the feature is used in a model.
If "gain", result contains total gains of splits which use the feature.
Returns
-------
self : CVBooster
Returns self.
"""
with open(filename, "w") as file:
json.dump(self._to_dict(num_iteration, start_iteration, importance_type), file)
return self
def _make_n_folds(
full_data: Dataset,
......
......@@ -3,23 +3,7 @@ import pytest
import lightgbm as lgb
from .utils import pickle_obj, unpickle_obj
SERIALIZERS = ["pickle", "joblib", "cloudpickle"]
def pickle_and_unpickle_object(obj, serializer):
with lgb.basic._TempFile() as tmp_file:
pickle_obj(
obj=obj,
filepath=tmp_file.name,
serializer=serializer
)
obj_from_disk = unpickle_obj(
filepath=tmp_file.name,
serializer=serializer
)
return obj_from_disk
from .utils import SERIALIZERS, pickle_and_unpickle_object, pickle_obj, unpickle_obj
def reset_feature_fraction(boosting_round):
......
......@@ -20,8 +20,9 @@ from sklearn.model_selection import GroupKFold, TimeSeriesSplit, train_test_spli
import lightgbm as lgb
from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame
from .utils import (dummy_obj, load_boston, load_breast_cancer, load_digits, load_iris, logistic_sigmoid,
make_synthetic_regression, mse_obj, sklearn_multiclass_custom_objective, softmax)
from .utils import (SERIALIZERS, dummy_obj, load_boston, load_breast_cancer, load_digits, load_iris, logistic_sigmoid,
make_synthetic_regression, mse_obj, pickle_and_unpickle_object, sklearn_multiclass_custom_objective,
softmax)
decreasing_generator = itertools.count(0, -1)
......@@ -1073,6 +1074,69 @@ def test_cvbooster():
assert ret < 0.15
def test_cvbooster_save_load(tmp_path):
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
params = {
'objective': 'binary',
'metric': 'binary_logloss',
'verbose': -1,
}
nfold = 3
lgb_train = lgb.Dataset(X_train, y_train)
cv_res = lgb.cv(params, lgb_train,
num_boost_round=10,
nfold=nfold,
callbacks=[lgb.early_stopping(stopping_rounds=5)],
return_cvbooster=True)
cvbooster = cv_res['cvbooster']
preds = cvbooster.predict(X_test)
best_iteration = cvbooster.best_iteration
model_path_txt = str(tmp_path / 'lgb.model')
cvbooster.save_model(model_path_txt)
model_string = cvbooster.model_to_string()
del cvbooster
cvbooster_from_txt_file = lgb.CVBooster(model_file=model_path_txt)
cvbooster_from_string = lgb.CVBooster().model_from_string(model_string)
for cvbooster_loaded in [cvbooster_from_txt_file, cvbooster_from_string]:
assert best_iteration == cvbooster_loaded.best_iteration
np.testing.assert_array_equal(preds, cvbooster_loaded.predict(X_test))
@pytest.mark.parametrize('serializer', SERIALIZERS)
def test_cvbooster_picklable(serializer):
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
params = {
'objective': 'binary',
'metric': 'binary_logloss',
'verbose': -1,
}
nfold = 3
lgb_train = lgb.Dataset(X_train, y_train)
cv_res = lgb.cv(params, lgb_train,
num_boost_round=10,
nfold=nfold,
callbacks=[lgb.early_stopping(stopping_rounds=5)],
return_cvbooster=True)
cvbooster = cv_res['cvbooster']
preds = cvbooster.predict(X_test)
best_iteration = cvbooster.best_iteration
cvbooster_from_disk = pickle_and_unpickle_object(obj=cvbooster, serializer=serializer)
del cvbooster
assert best_iteration == cvbooster_from_disk.best_iteration
preds_from_disk = cvbooster_from_disk.predict(X_test)
np.testing.assert_array_equal(preds, preds_from_disk)
def test_feature_name():
X_train, y_train = make_synthetic_regression()
params = {'verbose': -1}
......
......@@ -8,6 +8,10 @@ import numpy as np
import sklearn.datasets
from sklearn.utils import check_random_state
import lightgbm as lgb
SERIALIZERS = ["pickle", "joblib", "cloudpickle"]
@lru_cache(maxsize=None)
def load_boston(**kwargs):
......@@ -179,3 +183,17 @@ def unpickle_obj(filepath, serializer):
return cloudpickle.load(f)
else:
raise ValueError(f'Unrecognized serializer type: {serializer}')
def pickle_and_unpickle_object(obj, serializer):
with lgb.basic._TempFile() as tmp_file:
pickle_obj(
obj=obj,
filepath=tmp_file.name,
serializer=serializer
)
obj_from_disk = unpickle_obj(
filepath=tmp_file.name,
serializer=serializer
)
return obj_from_disk
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment