Commit a034ceeb authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

support pickle (#151)

* support pickle

* add pickle/joblib test; change test_basic to unittest

* remove file for deepcopy

* fix tests

* test basic predict from file

* Revert "test basic predict from file"

This reverts commit 60d2c3158537fd56081f60f1d6d120cedd782887.

* test predict from file

* use tempfile for copy & pickle

* use tempfile w/o binary mode

* clean test
parent 1d7acf57
...@@ -7,8 +7,8 @@ from __future__ import absolute_import ...@@ -7,8 +7,8 @@ from __future__ import absolute_import
import sys import sys
import ctypes import ctypes
import tempfile
import json import json
from tempfile import NamedTemporaryFile
import numpy as np import numpy as np
import scipy.sparse import scipy.sparse
...@@ -276,16 +276,15 @@ class _InnerPredictor(object): ...@@ -276,16 +276,15 @@ class _InnerPredictor(object):
if num_iteration > self.num_total_iteration: if num_iteration > self.num_total_iteration:
num_iteration = self.num_total_iteration num_iteration = self.num_total_iteration
if is_str(data): if is_str(data):
tmp_pred_fname = tempfile.NamedTemporaryFile(prefix="lightgbm_tmp_pred_").name with NamedTemporaryFile(mode='w+') as f:
_safe_call(_LIB.LGBM_BoosterPredictForFile( _safe_call(_LIB.LGBM_BoosterPredictForFile(
self.handle, self.handle,
c_str(data), c_str(data),
int_data_has_header, int_data_has_header,
predict_type, predict_type,
num_iteration, num_iteration,
c_str(tmp_pred_fname))) c_str(f.name)))
with open(tmp_pred_fname, "r") as tmp_file: lines = f.readlines()
lines = tmp_file.readlines()
nrow = len(lines) nrow = len(lines)
preds = [float(token) for line in lines for token in line.split('\t')] preds = [float(token) for line in lines for token in line.split('\t')]
preds = np.array(preds, dtype=np.float64, copy=False) preds = np.array(preds, dtype=np.float64, copy=False)
...@@ -1333,6 +1332,40 @@ class Booster(object): ...@@ -1333,6 +1332,40 @@ class Booster(object):
if self.handle is not None: if self.handle is not None:
_safe_call(_LIB.LGBM_BoosterFree(self.handle)) _safe_call(_LIB.LGBM_BoosterFree(self.handle))
def __copy__(self):
    """Shallow copy is implemented as a full deep copy.

    Delegates to ``__deepcopy__`` (no memo dict needed) — presumably
    because the underlying C model handle should not be shared between
    two Booster instances; TODO confirm against LGBM_BoosterFree usage.
    """
    return self.__deepcopy__(None)
def __deepcopy__(self, _):
    """Return an independent copy of this Booster.

    The model is round-tripped through a temporary file: saved with
    ``save_model`` and reloaded as a fresh ``Booster``. The memo dict
    argument is ignored.

    The temp file is created with ``delete=False`` and closed *before*
    ``save_model`` reopens it by name: an open ``NamedTemporaryFile``
    cannot be reopened by name on Windows (see tempfile docs), so the
    file is removed manually instead.
    """
    import os
    f = NamedTemporaryFile(mode='w+', delete=False)
    try:
        f.close()
        self.save_model(f.name)
        return Booster(model_file=f.name)
    finally:
        os.remove(f.name)
def __getstate__(self):
    """Serialize the Booster for pickling.

    The native C handle cannot be pickled, so the model is dumped to a
    temporary text file and its lines are stored under the ``'handle'``
    key instead. ``train_set``/``valid_sets`` hold native Dataset
    resources and are dropped; ``__setstate__`` does not restore them.
    """
    import os
    this = self.__dict__.copy()
    handle = this['handle']
    # Dataset objects wrap native resources and are not picklable.
    this.pop('train_set', None)
    this.pop('valid_sets', None)
    if handle is not None:
        # Create with delete=False and close before save_model reopens
        # the path: an open NamedTemporaryFile cannot be reopened by
        # name on Windows. Reading back through a separate handle also
        # avoids relying on the original descriptor's file position.
        f = NamedTemporaryFile(mode='w+', delete=False)
        try:
            f.close()
            self.save_model(f.name)
            with open(f.name) as model_file:
                this["handle"] = model_file.readlines()
        finally:
            os.remove(f.name)
    return this
def __setstate__(self, state):
    """Restore a pickled Booster.

    The model text stored by ``__getstate__`` under ``'handle'`` is
    written to a temporary file and loaded back through the C API to
    recreate the native handle. ``train_set``/``valid_sets`` are not
    restored (they were dropped at pickling time).
    """
    import os
    model = state['handle']
    if model is not None:
        handle = ctypes.c_void_p()
        out_num_iterations = ctypes.c_int64(0)
        f = NamedTemporaryFile(mode='w+', delete=False)
        try:
            f.writelines(model)
            # Close (not merely flush) before the C library reopens the
            # file by name — an open NamedTemporaryFile cannot be
            # reopened on Windows; remove the file manually afterwards.
            f.close()
            _safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
                c_str(f.name),
                ctypes.byref(out_num_iterations),
                ctypes.byref(handle)))
        finally:
            os.remove(f.name)
        state['handle'] = handle
    self.__dict__.update(state)
def set_train_data_name(self, name): def set_train_data_name(self, name):
self.__train_data_name = name self.__train_data_name = name
......
# coding: utf-8 # coding: utf-8
# pylint: skip-file
import unittest, tempfile
import numpy as np import numpy as np
from sklearn import datasets, metrics, model_selection from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import lightgbm as lgb import lightgbm as lgb
X, Y = datasets.make_classification(n_samples=100000, n_features=100) class TestBasic(unittest.TestCase):
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.1)
train_data = lgb.Dataset(x_train, max_bin=255, label=y_train) def test(self):
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1)
valid_data = train_data.create_valid(x_test, label=y_test) train_data = lgb.Dataset(X_train, max_bin=255, label=y_train)
valid_data = train_data.create_valid(X_test, label=y_test)
config={"objective":"binary","metric":"auc", "min_data":1, "num_leaves":15} params = {
bst = lgb.Booster(params=config, train_set=train_data) "objective" : "binary",
bst.add_valid(valid_data,"valid_1") "metric" : "auc",
"min_data" : 1,
"num_leaves" : 15,
"verbose" : -1
}
bst = lgb.Booster(params, train_data)
bst.add_valid(valid_data, "valid_1")
for i in range(100): for i in range(30):
bst.update() bst.update()
if i % 10 == 0: if i % 10 == 0:
print(bst.eval_train()) print(bst.eval_train(), bst.eval_valid())
print(bst.eval_valid()) bst.save_model("model.txt")
bst.save_model("model.txt") pred_from_matr = bst.predict(X_test)
with tempfile.NamedTemporaryFile() as f:
np.savetxt(f, X_test, delimiter=',')
f.flush()
pred_from_file = bst.predict(f.name)
self.assertEqual(len(pred_from_matr), len(pred_from_file))
for preds in zip(pred_from_matr, pred_from_file):
self.assertAlmostEqual(*preds, places=5)
print("----------------------------------------------------------------------")
print("running test_basic.py")
unittest.main()
# coding: utf-8 # coding: utf-8
# pylint: skip-file # pylint: skip-file
import os, unittest, math import os, unittest, math, copy
import numpy as np import numpy as np
import lightgbm as lgb import lightgbm as lgb
from sklearn.metrics import log_loss, mean_squared_error, mean_absolute_error from sklearn.metrics import log_loss, mean_squared_error, mean_absolute_error
from sklearn.datasets import load_breast_cancer, load_boston, load_digits, load_iris from sklearn.datasets import load_breast_cancer, load_boston, load_digits, load_iris
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
try:
import cPickle as pickle
except:
import pickle
def multi_logloss(y_true, y_pred): def multi_logloss(y_true, y_pred):
return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)]) return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)])
def test_template(params = {'objective' : 'regression', 'metric' : 'l2'}, def test_template(params = {'objective' : 'regression', 'metric' : 'l2'},
X_y=load_boston(True), feval=mean_squared_error, X_y=load_boston(True), feval=mean_squared_error,
stratify=None, num_round=100, return_data=False, num_round=100, init_model=None, custom_eval=None,
return_model=False, init_model=None, custom_eval=None): return_data=False, return_model=False):
X, y = X_y X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, lgb_train = lgb.Dataset(X_train, y_train, params=params)
stratify=stratify, lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
random_state=42)
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=not return_model, params=params)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=not return_model, params=params)
if return_data: return lgb_train, lgb_eval if return_data: return lgb_train, lgb_eval
evals_result = {} evals_result = {}
params['verbose'] = 0 params['verbose'] = params['seed'] = 0
gbm = lgb.train(params, lgb_train, gbm = lgb.train(params, lgb_train,
num_boost_round=num_round, num_boost_round=num_round,
valid_sets=lgb_eval, valid_sets=lgb_eval,
...@@ -35,7 +36,7 @@ def test_template(params = {'objective' : 'regression', 'metric' : 'l2'}, ...@@ -35,7 +36,7 @@ def test_template(params = {'objective' : 'regression', 'metric' : 'l2'},
if return_model: return gbm if return_model: return gbm
else: return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration)) else: return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration))
class TestBasic(unittest.TestCase): class TestEngine(unittest.TestCase):
def test_binary(self): def test_binary(self):
X_y= load_breast_cancer(True) X_y= load_breast_cancer(True)
...@@ -43,7 +44,7 @@ class TestBasic(unittest.TestCase): ...@@ -43,7 +44,7 @@ class TestBasic(unittest.TestCase):
'objective' : 'binary', 'objective' : 'binary',
'metric' : 'binary_logloss' 'metric' : 'binary_logloss'
} }
evals_result, ret = test_template(params, X_y, log_loss, stratify=X_y[1]) evals_result, ret = test_template(params, X_y, log_loss)
self.assertLess(ret, 0.15) self.assertLess(ret, 0.15)
self.assertAlmostEqual(min(evals_result['eval']['logloss']), ret, places=5) self.assertAlmostEqual(min(evals_result['eval']['logloss']), ret, places=5)
...@@ -60,7 +61,7 @@ class TestBasic(unittest.TestCase): ...@@ -60,7 +61,7 @@ class TestBasic(unittest.TestCase):
'metric' : 'multi_logloss', 'metric' : 'multi_logloss',
'num_class' : 10 'num_class' : 10
} }
evals_result, ret = test_template(params, X_y, multi_logloss, stratify=X_y[1]) evals_result, ret = test_template(params, X_y, multi_logloss)
self.assertLess(ret, 0.2) self.assertLess(ret, 0.2)
self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5) self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5)
...@@ -90,17 +91,37 @@ class TestBasic(unittest.TestCase): ...@@ -90,17 +91,37 @@ class TestBasic(unittest.TestCase):
'metric' : 'multi_logloss', 'metric' : 'multi_logloss',
'num_class' : 3 'num_class' : 3
} }
gbm = test_template(params, X_y, num_round=20, return_model=True, stratify=X_y[1]) gbm = test_template(params, X_y, num_round=20, return_model=True)
evals_result, ret = test_template(params, X_y, feval=multi_logloss, evals_result, ret = test_template(params, X_y, feval=multi_logloss,
num_round=80, init_model=gbm) num_round=80, init_model=gbm)
self.assertLess(ret, 1.5) self.assertLess(ret, 1.5)
self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5) self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5)
def test_cv(self): def test_cv(self):
lgb_train, lgb_eval = test_template(return_data=True) lgb_train, _ = test_template(return_data=True)
lgb.cv({'verbose':0}, lgb_train, num_boost_round=200, nfold=5, lgb.cv({'verbose':0}, lgb_train, num_boost_round=20, nfold=5,
metrics='l1', verbose_eval=False) metrics='l1', verbose_eval=False)
def test_save_load_copy_pickle(self):
    """Continuing training from a saved / reloaded / copied / pickled
    model must score (almost) the same as continuing from the original.
    """
    origin = test_template(num_round=20, return_model=True)
    baseline = test_template(init_model=origin)[1]
    origin.save_model('lgb.model')
    with open('lgb.pkl', 'wb') as fout:
        pickle.dump(origin, fout)
    with open('lgb.pkl', 'rb') as fin:
        unpickled = pickle.load(fin)
    variants = [
        'lgb.model',                          # init from model file path
        lgb.Booster(model_file='lgb.model'),  # loaded from file
        copy.copy(origin),                    # shallow copy
        copy.deepcopy(origin),                # deep copy
        unpickled,                            # pickle via file
        pickle.loads(pickle.dumps(origin)),   # pickle via bytes
    ]
    for init in variants:
        self.assertAlmostEqual(baseline, test_template(init_model=init)[1], places=5)
print("----------------------------------------------------------------------") print("----------------------------------------------------------------------")
print("running test_engine.py") print("running test_engine.py")
unittest.main() unittest.main()
...@@ -3,30 +3,30 @@ ...@@ -3,30 +3,30 @@
import os, unittest import os, unittest
import numpy as np import numpy as np
import lightgbm as lgb import lightgbm as lgb
from sklearn.metrics import log_loss, mean_squared_error, mean_absolute_error from sklearn.metrics import log_loss, mean_squared_error
from sklearn.datasets import load_breast_cancer, load_boston, load_digits, load_iris, load_svmlight_file from sklearn.datasets import load_breast_cancer, load_boston, load_digits, load_svmlight_file
from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import clone from sklearn.base import clone
from sklearn.externals import joblib
def test_template(X_y=load_boston(True), model=lgb.LGBMRegressor, def test_template(X_y=load_boston(True), model=lgb.LGBMRegressor,
feval=mean_squared_error, stratify=None, num_round=100, return_data=False, feval=mean_squared_error, num_round=100,
return_model=False, init_model=None, custom_obj=None, proba=False): custom_obj=None, predict_proba=False,
X, y = X_y return_data=False, return_model=False):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
stratify=stratify,
random_state=42)
if return_data: return X_train, X_test, y_train, y_test if return_data: return X_train, X_test, y_train, y_test
if not custom_obj: gbm = model(n_estimators=num_round, silent=True) arguments = {'n_estimators' : num_round, 'silent' : True}
else: gbm = model(n_estimators=num_round, objective=custom_obj, silent=True) if custom_obj: arguments['objective'] = custom_obj
gbm = model(**arguments)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
if return_model: return gbm if return_model: return gbm
else: return feval(y_test, gbm.predict_proba(X_test) if proba else gbm.predict(X_test)) else: return feval(y_test, gbm.predict_proba(X_test) if predict_proba else gbm.predict(X_test))
class TestSklearn(unittest.TestCase): class TestSklearn(unittest.TestCase):
def test_binary(self): def test_binary(self):
X_y= load_breast_cancer(True) X_y= load_breast_cancer(True)
ret = test_template(X_y, lgb.LGBMClassifier, log_loss, stratify=X_y[1], proba=True) ret = test_template(X_y, lgb.LGBMClassifier, log_loss, predict_proba=True)
self.assertLess(ret, 0.15) self.assertLess(ret, 0.15)
def test_regreesion(self): def test_regreesion(self):
...@@ -36,7 +36,7 @@ class TestSklearn(unittest.TestCase): ...@@ -36,7 +36,7 @@ class TestSklearn(unittest.TestCase):
X_y = load_digits(10, True) X_y = load_digits(10, True)
def multi_error(y_true, y_pred): def multi_error(y_true, y_pred):
return np.mean(y_true != y_pred) return np.mean(y_true != y_pred)
ret = test_template(X_y, lgb.LGBMClassifier, multi_error, stratify=X_y[1]) ret = test_template(X_y, lgb.LGBMClassifier, multi_error)
self.assertLess(ret, 0.2) self.assertLess(ret, 0.2)
def test_lambdarank(self): def test_lambdarank(self):
...@@ -90,6 +90,21 @@ class TestSklearn(unittest.TestCase): ...@@ -90,6 +90,21 @@ class TestSklearn(unittest.TestCase):
gbm = test_template(return_model=True) gbm = test_template(return_model=True)
gbm_clone = clone(gbm) gbm_clone = clone(gbm)
def test_joblib(self):
    """A joblib dump/load round-trip must preserve the estimator's
    params and yield a model that refits and predicts identically."""
    original = test_template(num_round=10, return_model=True)
    joblib.dump(original, 'lgb.pkl')
    restored = joblib.load('lgb.pkl')
    self.assertDictEqual(original.get_params(), restored.get_params())
    X_train, X_test, y_train, y_test = test_template(return_data=True)
    for est in (original, restored):
        est.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    self.assertDictEqual(original.evals_result(), restored.evals_result())
    preds_a = original.predict(X_test)
    preds_b = restored.predict(X_test)
    self.assertEqual(len(preds_a), len(preds_b))
    for a, b in zip(preds_a, preds_b):
        self.assertAlmostEqual(a, b, places=5)
print("----------------------------------------------------------------------") print("----------------------------------------------------------------------")
print("running test_sklearn.py") print("running test_sklearn.py")
unittest.main() unittest.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment