Commit a034ceeb authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

support pickle (#151)

* support pickle

* add pickle/joblib test; change test_basic to unittest

* remove file for deepcopy

* fix tests

* test basic predict from file

* Revert "test basic predict from file"

This reverts commit 60d2c3158537fd56081f60f1d6d120cedd782887.

* test predict from file

* use tempfile for copy & pickle

* use tempfile w/o binary mode

* clean test
parent 1d7acf57
......@@ -7,8 +7,8 @@ from __future__ import absolute_import
import ctypes
import json
import os
import sys
import tempfile
from tempfile import NamedTemporaryFile

import numpy as np
import scipy.sparse
......@@ -276,16 +276,15 @@ class _InnerPredictor(object):
if num_iteration > self.num_total_iteration:
num_iteration = self.num_total_iteration
if is_str(data):
tmp_pred_fname = tempfile.NamedTemporaryFile(prefix="lightgbm_tmp_pred_").name
with NamedTemporaryFile(mode='w+') as f:
_safe_call(_LIB.LGBM_BoosterPredictForFile(
self.handle,
c_str(data),
int_data_has_header,
predict_type,
num_iteration,
c_str(tmp_pred_fname)))
with open(tmp_pred_fname, "r") as tmp_file:
lines = tmp_file.readlines()
c_str(f.name)))
lines = f.readlines()
nrow = len(lines)
preds = [float(token) for line in lines for token in line.split('\t')]
preds = np.array(preds, dtype=np.float64, copy=False)
......@@ -1333,6 +1332,40 @@ class Booster(object):
if self.handle is not None:
_safe_call(_LIB.LGBM_BoosterFree(self.handle))
def __copy__(self):
    """Return a copy of this Booster.

    A shallow copy cannot share the underlying C booster handle safely,
    so copying always goes through the deep-copy path.
    """
    return self.__deepcopy__(None)
def __deepcopy__(self, _):
    """Deep-copy the Booster by round-tripping the model through a file.

    The C handle cannot be duplicated directly, so the model is saved to
    a temporary file and a fresh Booster is loaded from it.

    Note: the original implementation kept a ``NamedTemporaryFile`` open
    while ``save_model``/``Booster(model_file=...)`` reopened the same
    path by name, which the tempfile docs state does not work on
    Windows.  ``mkstemp`` + an immediate close avoids that, and the
    ``finally`` guarantees the temp file is removed.
    """
    fd, tmp_name = tempfile.mkstemp(prefix="lightgbm_deepcopy_")
    os.close(fd)  # release the fd so other openers can use the path on any OS
    try:
        self.save_model(tmp_name)
        return Booster(model_file=tmp_name)
    finally:
        os.remove(tmp_name)
def __getstate__(self):
    """Support pickling by replacing the C handle with the model text.

    Returns a copy of ``__dict__`` where:
    - ``train_set`` / ``valid_sets`` are dropped (Datasets hold C
      resources and are not picklable), and
    - ``handle`` is replaced by the saved model's lines when a live
      handle exists.

    Fixes two issues in the original: it kept a ``NamedTemporaryFile``
    open while ``save_model`` reopened the same path by name (fails on
    Windows per the tempfile docs), and it read the file back through
    the still-open handle after an external writer modified it.
    """
    this = self.__dict__.copy()
    handle = this['handle']
    this.pop('train_set', None)
    this.pop('valid_sets', None)
    if handle is not None:
        fd, tmp_name = tempfile.mkstemp(prefix="lightgbm_getstate_")
        os.close(fd)
        try:
            self.save_model(tmp_name)
            # Store the model as a list of text lines, matching what
            # __setstate__ writes back with f.writelines().
            with open(tmp_name) as model_file:
                this["handle"] = model_file.readlines()
        finally:
            os.remove(tmp_name)
    return this
def __setstate__(self, state):
    """Support unpickling by rebuilding the C handle from model text.

    ``state['handle']`` is the list of model lines produced by
    ``__getstate__``; it is written to a temporary file from which the
    C library loads a fresh booster handle.

    Fixes the original's use of a still-open ``NamedTemporaryFile``
    that the C library then reopened by name — the tempfile docs state
    this does not work on Windows.  ``mkstemp`` + close + explicit
    removal is portable and never leaks the temp file.
    """
    model = state['handle']
    if model is not None:
        handle = ctypes.c_void_p()
        out_num_iterations = ctypes.c_int64(0)
        fd, tmp_name = tempfile.mkstemp(prefix="lightgbm_setstate_")
        os.close(fd)
        try:
            with open(tmp_name, "w") as model_file:
                model_file.writelines(model)
            _safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
                c_str(tmp_name),
                ctypes.byref(out_num_iterations),
                ctypes.byref(handle)))
        finally:
            os.remove(tmp_name)
        state['handle'] = handle
    self.__dict__.update(state)
def set_train_data_name(self, name):
    """Set the name used to refer to the training data.

    Stored on a name-mangled private attribute; presumably used when
    reporting training-set evaluation results — confirm against the
    eval/printing code elsewhere in this class.
    """
    self.__train_data_name = name
......
# coding: utf-8
# pylint: skip-file
import unittest, tempfile
import numpy as np
from sklearn import datasets, metrics, model_selection
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import lightgbm as lgb
X, Y = datasets.make_classification(n_samples=100000, n_features=100)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.1)
class TestBasic(unittest.TestCase):
train_data = lgb.Dataset(x_train, max_bin=255, label=y_train)
def test(self):
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1)
valid_data = train_data.create_valid(x_test, label=y_test)
train_data = lgb.Dataset(X_train, max_bin=255, label=y_train)
valid_data = train_data.create_valid(X_test, label=y_test)
config={"objective":"binary","metric":"auc", "min_data":1, "num_leaves":15}
bst = lgb.Booster(params=config, train_set=train_data)
bst.add_valid(valid_data,"valid_1")
params = {
"objective" : "binary",
"metric" : "auc",
"min_data" : 1,
"num_leaves" : 15,
"verbose" : -1
}
bst = lgb.Booster(params, train_data)
bst.add_valid(valid_data, "valid_1")
for i in range(100):
for i in range(30):
bst.update()
if i % 10 == 0:
print(bst.eval_train())
print(bst.eval_valid())
bst.save_model("model.txt")
print(bst.eval_train(), bst.eval_valid())
bst.save_model("model.txt")
pred_from_matr = bst.predict(X_test)
with tempfile.NamedTemporaryFile() as f:
np.savetxt(f, X_test, delimiter=',')
f.flush()
pred_from_file = bst.predict(f.name)
self.assertEqual(len(pred_from_matr), len(pred_from_file))
for preds in zip(pred_from_matr, pred_from_file):
self.assertAlmostEqual(*preds, places=5)
print("----------------------------------------------------------------------")
print("running test_basic.py")
unittest.main()
# coding: utf-8
# pylint: skip-file
import os, unittest, math
import os, unittest, math, copy
import numpy as np
import lightgbm as lgb
from sklearn.metrics import log_loss, mean_squared_error, mean_absolute_error
from sklearn.datasets import load_breast_cancer, load_boston, load_digits, load_iris
from sklearn.model_selection import train_test_split
try:
import cPickle as pickle
except:
import pickle
def multi_logloss(y_true, y_pred):
return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)])
def test_template(params = {'objective' : 'regression', 'metric' : 'l2'},
X_y=load_boston(True), feval=mean_squared_error,
stratify=None, num_round=100, return_data=False,
return_model=False, init_model=None, custom_eval=None):
X, y = X_y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
stratify=stratify,
random_state=42)
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=not return_model, params=params)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=not return_model, params=params)
num_round=100, init_model=None, custom_eval=None,
return_data=False, return_model=False):
X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
lgb_train = lgb.Dataset(X_train, y_train, params=params)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
if return_data: return lgb_train, lgb_eval
evals_result = {}
params['verbose'] = 0
params['verbose'] = params['seed'] = 0
gbm = lgb.train(params, lgb_train,
num_boost_round=num_round,
valid_sets=lgb_eval,
......@@ -35,7 +36,7 @@ def test_template(params = {'objective' : 'regression', 'metric' : 'l2'},
if return_model: return gbm
else: return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration))
class TestBasic(unittest.TestCase):
class TestEngine(unittest.TestCase):
def test_binary(self):
X_y= load_breast_cancer(True)
......@@ -43,7 +44,7 @@ class TestBasic(unittest.TestCase):
'objective' : 'binary',
'metric' : 'binary_logloss'
}
evals_result, ret = test_template(params, X_y, log_loss, stratify=X_y[1])
evals_result, ret = test_template(params, X_y, log_loss)
self.assertLess(ret, 0.15)
self.assertAlmostEqual(min(evals_result['eval']['logloss']), ret, places=5)
......@@ -60,7 +61,7 @@ class TestBasic(unittest.TestCase):
'metric' : 'multi_logloss',
'num_class' : 10
}
evals_result, ret = test_template(params, X_y, multi_logloss, stratify=X_y[1])
evals_result, ret = test_template(params, X_y, multi_logloss)
self.assertLess(ret, 0.2)
self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5)
......@@ -90,17 +91,37 @@ class TestBasic(unittest.TestCase):
'metric' : 'multi_logloss',
'num_class' : 3
}
gbm = test_template(params, X_y, num_round=20, return_model=True, stratify=X_y[1])
gbm = test_template(params, X_y, num_round=20, return_model=True)
evals_result, ret = test_template(params, X_y, feval=multi_logloss,
num_round=80, init_model=gbm)
self.assertLess(ret, 1.5)
self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5)
def test_cv(self):
lgb_train, lgb_eval = test_template(return_data=True)
lgb.cv({'verbose':0}, lgb_train, num_boost_round=200, nfold=5,
lgb_train, _ = test_template(return_data=True)
lgb.cv({'verbose':0}, lgb_train, num_boost_round=20, nfold=5,
metrics='l1', verbose_eval=False)
def test_save_load_copy_pickle(self):
    """Clone a trained model via every supported mechanism (file save,
    Booster(model_file=...), copy, deepcopy, pickle file, pickle bytes)
    and check each clone continues training to the same score."""
    base_model = test_template(num_round=20, return_model=True)
    _, expected = test_template(init_model=base_model)
    base_model.save_model('lgb.model')
    clones = [
        'lgb.model',
        lgb.Booster(model_file='lgb.model'),
        copy.copy(base_model),
        copy.deepcopy(base_model),
    ]
    with open('lgb.pkl', 'wb') as f:
        pickle.dump(base_model, f)
    with open('lgb.pkl', 'rb') as f:
        clones.append(pickle.load(f))
    clones.append(pickle.loads(pickle.dumps(base_model)))
    for clone in clones:
        score = test_template(init_model=clone)[1]
        self.assertAlmostEqual(expected, score, places=5)
print("----------------------------------------------------------------------")
print("running test_engine.py")
unittest.main()
......@@ -3,30 +3,30 @@
import os, unittest
import numpy as np
import lightgbm as lgb
from sklearn.metrics import log_loss, mean_squared_error, mean_absolute_error
from sklearn.datasets import load_breast_cancer, load_boston, load_digits, load_iris, load_svmlight_file
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.datasets import load_breast_cancer, load_boston, load_digits, load_svmlight_file
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import clone
from sklearn.externals import joblib
def test_template(X_y=load_boston(True), model=lgb.LGBMRegressor,
feval=mean_squared_error, stratify=None, num_round=100, return_data=False,
return_model=False, init_model=None, custom_obj=None, proba=False):
X, y = X_y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
stratify=stratify,
random_state=42)
feval=mean_squared_error, num_round=100,
custom_obj=None, predict_proba=False,
return_data=False, return_model=False):
X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
if return_data: return X_train, X_test, y_train, y_test
if not custom_obj: gbm = model(n_estimators=num_round, silent=True)
else: gbm = model(n_estimators=num_round, objective=custom_obj, silent=True)
arguments = {'n_estimators' : num_round, 'silent' : True}
if custom_obj: arguments['objective'] = custom_obj
gbm = model(**arguments)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
if return_model: return gbm
else: return feval(y_test, gbm.predict_proba(X_test) if proba else gbm.predict(X_test))
else: return feval(y_test, gbm.predict_proba(X_test) if predict_proba else gbm.predict(X_test))
class TestSklearn(unittest.TestCase):
def test_binary(self):
X_y= load_breast_cancer(True)
ret = test_template(X_y, lgb.LGBMClassifier, log_loss, stratify=X_y[1], proba=True)
ret = test_template(X_y, lgb.LGBMClassifier, log_loss, predict_proba=True)
self.assertLess(ret, 0.15)
def test_regreesion(self):
......@@ -36,7 +36,7 @@ class TestSklearn(unittest.TestCase):
X_y = load_digits(10, True)
def multi_error(y_true, y_pred):
return np.mean(y_true != y_pred)
ret = test_template(X_y, lgb.LGBMClassifier, multi_error, stratify=X_y[1])
ret = test_template(X_y, lgb.LGBMClassifier, multi_error)
self.assertLess(ret, 0.2)
def test_lambdarank(self):
......@@ -90,6 +90,21 @@ class TestSklearn(unittest.TestCase):
gbm = test_template(return_model=True)
gbm_clone = clone(gbm)
def test_joblib(self):
    """Check that a joblib round-trip preserves the sklearn wrapper's
    params, and that original and restored models train and predict
    identically."""
    original = test_template(num_round=10, return_model=True)
    joblib.dump(original, 'lgb.pkl')
    restored = joblib.load('lgb.pkl')
    self.assertDictEqual(original.get_params(), restored.get_params())
    X_train, X_test, y_train, y_test = test_template(return_data=True)
    original.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    restored.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    self.assertDictEqual(original.evals_result(), restored.evals_result())
    pred_orig = original.predict(X_test)
    pred_rest = restored.predict(X_test)
    self.assertEqual(len(pred_orig), len(pred_rest))
    for a, b in zip(pred_orig, pred_rest):
        self.assertAlmostEqual(a, b, places=5)
print("----------------------------------------------------------------------")
print("running test_sklearn.py")
unittest.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment