Commit a034ceeb authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

support pickle (#151)

* support pickle

* add pickle/joblib test; change test_basic to unittest

* remove file for deepcopy

* fix tests

* test basic predict from file

* Revert "test basic predict from file"

This reverts commit 60d2c3158537fd56081f60f1d6d120cedd782887.

* test predict from file

* use tempfile for copy & pickle

* use tempfile w/o binary mode

* clean test
parent 1d7acf57
......@@ -7,8 +7,8 @@ from __future__ import absolute_import
import ctypes
import json
import os
import sys
import tempfile
from tempfile import NamedTemporaryFile

import numpy as np
import scipy.sparse
......@@ -276,16 +276,15 @@ class _InnerPredictor(object):
if num_iteration > self.num_total_iteration:
num_iteration = self.num_total_iteration
if is_str(data):
tmp_pred_fname = tempfile.NamedTemporaryFile(prefix="lightgbm_tmp_pred_").name
with NamedTemporaryFile(mode='w+') as f:
_safe_call(_LIB.LGBM_BoosterPredictForFile(
self.handle,
c_str(data),
int_data_has_header,
predict_type,
num_iteration,
c_str(tmp_pred_fname)))
with open(tmp_pred_fname, "r") as tmp_file:
lines = tmp_file.readlines()
c_str(f.name)))
lines = f.readlines()
nrow = len(lines)
preds = [float(token) for line in lines for token in line.split('\t')]
preds = np.array(preds, dtype=np.float64, copy=False)
......@@ -1333,6 +1332,40 @@ class Booster(object):
if self.handle is not None:
_safe_call(_LIB.LGBM_BoosterFree(self.handle))
def __copy__(self):
    """Return a copy of this Booster.

    A shallow copy cannot share the underlying C booster handle safely,
    so copying always goes through the deep-copy path.
    """
    return self.__deepcopy__(None)
def __deepcopy__(self, _):
    """Deep-copy the Booster by round-tripping the model through a file.

    The C handle cannot be duplicated directly, so the model is saved to
    a temporary file and a fresh Booster is loaded from it.

    Note: the original implementation kept a ``NamedTemporaryFile`` open
    while ``save_model``/``Booster(model_file=...)`` reopened the same
    path by name, which the tempfile docs state does not work on
    Windows.  ``mkstemp`` + an immediate close avoids that, and the
    ``finally`` guarantees the temp file is removed.
    """
    fd, tmp_name = tempfile.mkstemp(prefix="lightgbm_deepcopy_")
    os.close(fd)  # release the fd so other openers can use the path on any OS
    try:
        self.save_model(tmp_name)
        return Booster(model_file=tmp_name)
    finally:
        os.remove(tmp_name)
def __getstate__(self):
    """Support pickling by replacing the C handle with the model text.

    Returns a copy of ``__dict__`` where:
    - ``train_set`` / ``valid_sets`` are dropped (Datasets hold C
      resources and are not picklable), and
    - ``handle`` is replaced by the saved model's lines when a live
      handle exists.

    Fixes two issues in the original: it kept a ``NamedTemporaryFile``
    open while ``save_model`` reopened the same path by name (fails on
    Windows per the tempfile docs), and it read the file back through
    the still-open handle after an external writer modified it.
    """
    this = self.__dict__.copy()
    handle = this['handle']
    this.pop('train_set', None)
    this.pop('valid_sets', None)
    if handle is not None:
        fd, tmp_name = tempfile.mkstemp(prefix="lightgbm_getstate_")
        os.close(fd)
        try:
            self.save_model(tmp_name)
            # Store the model as a list of text lines, matching what
            # __setstate__ writes back with f.writelines().
            with open(tmp_name) as model_file:
                this["handle"] = model_file.readlines()
        finally:
            os.remove(tmp_name)
    return this
def __setstate__(self, state):
    """Support unpickling by rebuilding the C handle from model text.

    ``state['handle']`` is the list of model lines produced by
    ``__getstate__``; it is written to a temporary file from which the
    C library loads a fresh booster handle.

    Fixes the original's use of a still-open ``NamedTemporaryFile``
    that the C library then reopened by name — the tempfile docs state
    this does not work on Windows.  ``mkstemp`` + close + explicit
    removal is portable and never leaks the temp file.
    """
    model = state['handle']
    if model is not None:
        handle = ctypes.c_void_p()
        out_num_iterations = ctypes.c_int64(0)
        fd, tmp_name = tempfile.mkstemp(prefix="lightgbm_setstate_")
        os.close(fd)
        try:
            with open(tmp_name, "w") as model_file:
                model_file.writelines(model)
            _safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
                c_str(tmp_name),
                ctypes.byref(out_num_iterations),
                ctypes.byref(handle)))
        finally:
            os.remove(tmp_name)
        state['handle'] = handle
    self.__dict__.update(state)
def set_train_data_name(self, name):
    """Set the name used to refer to the training data.

    Stored on a name-mangled private attribute; presumably used when
    reporting training-set evaluation results — confirm against the
    eval/printing code elsewhere in this class.
    """
    self.__train_data_name = name
......
# coding: utf-8
# pylint: skip-file
import unittest, tempfile
import numpy as np
from sklearn import datasets, metrics, model_selection
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import lightgbm as lgb
X, Y = datasets.make_classification(n_samples=100000, n_features=100)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.1)
class TestBasic(unittest.TestCase):
train_data = lgb.Dataset(x_train, max_bin=255, label=y_train)
def test(self):
X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1)
valid_data = train_data.create_valid(x_test, label=y_test)
train_data = lgb.Dataset(X_train, max_bin=255, label=y_train)
valid_data = train_data.create_valid(X_test, label=y_test)
config={"objective":"binary","metric":"auc", "min_data":1, "num_leaves":15}
bst = lgb.Booster(params=config, train_set=train_data)
bst.add_valid(valid_data,"valid_1")
params = {
"objective" : "binary",
"metric" : "auc",
"min_data" : 1,
"num_leaves" : 15,
"verbose" : -1
}
bst = lgb.Booster(params, train_data)
bst.add_valid(valid_data, "valid_1")
for i in range(100):
for i in range(30):
bst.update()
if i % 10 == 0:
print(bst.eval_train())
print(bst.eval_valid())
bst.save_model("model.txt")
print(bst.eval_train(), bst.eval_valid())
bst.save_model("model.txt")
pred_from_matr = bst.predict(X_test)
with tempfile.NamedTemporaryFile() as f:
np.savetxt(f, X_test, delimiter=',')
f.flush()
pred_from_file = bst.predict(f.name)
self.assertEqual(len(pred_from_matr), len(pred_from_file))
for preds in zip(pred_from_matr, pred_from_file):
self.assertAlmostEqual(*preds, places=5)
print("----------------------------------------------------------------------")
print("running test_basic.py")
unittest.main()
# coding: utf-8
# pylint: skip-file
import os, unittest, math
import os, unittest, math, copy
import numpy as np
import lightgbm as lgb
from sklearn.metrics import log_loss, mean_squared_error, mean_absolute_error
from sklearn.datasets import load_breast_cancer, load_boston, load_digits, load_iris
from sklearn.model_selection import train_test_split
try:
import cPickle as pickle
except:
import pickle
def multi_logloss(y_true, y_pred):
return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)])
def test_template(params = {'objective' : 'regression', 'metric' : 'l2'},
X_y=load_boston(True), feval=mean_squared_error,
stratify=None, num_round=100, return_data=False,
return_model=False, init_model=None, custom_eval=None):
X, y = X_y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
stratify=stratify,
random_state=42)
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=not return_model, params=params)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=not return_model, params=params)
num_round=100, init_model=None, custom_eval=None,
return_data=False, return_model=False):
X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
lgb_train = lgb.Dataset(X_train, y_train, params=params)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
if return_data: return lgb_train, lgb_eval
evals_result = {}
params['verbose'] = 0
params['verbose'] = params['seed'] = 0
gbm = lgb.train(params, lgb_train,
num_boost_round=num_round,
valid_sets=lgb_eval,
......@@ -35,7 +36,7 @@ def test_template(params = {'objective' : 'regression', 'metric' : 'l2'},
if return_model: return gbm
else: return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration))
class TestBasic(unittest.TestCase):
class TestEngine(unittest.TestCase):
def test_binary(self):
X_y= load_breast_cancer(True)
......@@ -43,7 +44,7 @@ class TestBasic(unittest.TestCase):
'objective' : 'binary',
'metric' : 'binary_logloss'
}
evals_result, ret = test_template(params, X_y, log_loss, stratify=X_y[1])
evals_result, ret = test_template(params, X_y, log_loss)
self.assertLess(ret, 0.15)
self.assertAlmostEqual(min(evals_result['eval']['logloss']), ret, places=5)
......@@ -60,7 +61,7 @@ class TestBasic(unittest.TestCase):
'metric' : 'multi_logloss',
'num_class' : 10
}
evals_result, ret = test_template(params, X_y, multi_logloss, stratify=X_y[1])
evals_result, ret = test_template(params, X_y, multi_logloss)
self.assertLess(ret, 0.2)
self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5)
......@@ -90,17 +91,37 @@ class TestBasic(unittest.TestCase):
'metric' : 'multi_logloss',
'num_class' : 3
}
gbm = test_template(params, X_y, num_round=20, return_model=True, stratify=X_y[1])
gbm = test_template(params, X_y, num_round=20, return_model=True)
evals_result, ret = test_template(params, X_y, feval=multi_logloss,
num_round=80, init_model=gbm)
self.assertLess(ret, 1.5)
self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5)
def test_cv(self):
lgb_train, lgb_eval = test_template(return_data=True)
lgb.cv({'verbose':0}, lgb_train, num_boost_round=200, nfold=5,
lgb_train, _ = test_template(return_data=True)
lgb.cv({'verbose':0}, lgb_train, num_boost_round=20, nfold=5,
metrics='l1', verbose_eval=False)
def test_save_load_copy_pickle(self):
    """Clone a trained model via every supported mechanism (file save,
    Booster(model_file=...), copy, deepcopy, pickle file, pickle bytes)
    and check each clone continues training to the same score."""
    base_model = test_template(num_round=20, return_model=True)
    _, expected = test_template(init_model=base_model)
    base_model.save_model('lgb.model')
    clones = [
        'lgb.model',
        lgb.Booster(model_file='lgb.model'),
        copy.copy(base_model),
        copy.deepcopy(base_model),
    ]
    with open('lgb.pkl', 'wb') as f:
        pickle.dump(base_model, f)
    with open('lgb.pkl', 'rb') as f:
        clones.append(pickle.load(f))
    clones.append(pickle.loads(pickle.dumps(base_model)))
    for clone in clones:
        score = test_template(init_model=clone)[1]
        self.assertAlmostEqual(expected, score, places=5)
print("----------------------------------------------------------------------")
print("running test_engine.py")
unittest.main()
......@@ -3,30 +3,30 @@
import os, unittest
import numpy as np
import lightgbm as lgb
from sklearn.metrics import log_loss, mean_squared_error, mean_absolute_error
from sklearn.datasets import load_breast_cancer, load_boston, load_digits, load_iris, load_svmlight_file
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.datasets import load_breast_cancer, load_boston, load_digits, load_svmlight_file
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import clone
from sklearn.externals import joblib
def test_template(X_y=load_boston(True), model=lgb.LGBMRegressor,
feval=mean_squared_error, stratify=None, num_round=100, return_data=False,
return_model=False, init_model=None, custom_obj=None, proba=False):
X, y = X_y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
stratify=stratify,
random_state=42)
feval=mean_squared_error, num_round=100,
custom_obj=None, predict_proba=False,
return_data=False, return_model=False):
X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
if return_data: return X_train, X_test, y_train, y_test
if not custom_obj: gbm = model(n_estimators=num_round, silent=True)
else: gbm = model(n_estimators=num_round, objective=custom_obj, silent=True)
arguments = {'n_estimators' : num_round, 'silent' : True}
if custom_obj: arguments['objective'] = custom_obj
gbm = model(**arguments)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
if return_model: return gbm
else: return feval(y_test, gbm.predict_proba(X_test) if proba else gbm.predict(X_test))
else: return feval(y_test, gbm.predict_proba(X_test) if predict_proba else gbm.predict(X_test))
class TestSklearn(unittest.TestCase):
def test_binary(self):
X_y= load_breast_cancer(True)
ret = test_template(X_y, lgb.LGBMClassifier, log_loss, stratify=X_y[1], proba=True)
ret = test_template(X_y, lgb.LGBMClassifier, log_loss, predict_proba=True)
self.assertLess(ret, 0.15)
def test_regreesion(self):
......@@ -36,7 +36,7 @@ class TestSklearn(unittest.TestCase):
X_y = load_digits(10, True)
def multi_error(y_true, y_pred):
return np.mean(y_true != y_pred)
ret = test_template(X_y, lgb.LGBMClassifier, multi_error, stratify=X_y[1])
ret = test_template(X_y, lgb.LGBMClassifier, multi_error)
self.assertLess(ret, 0.2)
def test_lambdarank(self):
......@@ -90,6 +90,21 @@ class TestSklearn(unittest.TestCase):
gbm = test_template(return_model=True)
gbm_clone = clone(gbm)
def test_joblib(self):
    """Check that a joblib round-trip preserves the sklearn wrapper's
    params, and that original and restored models train and predict
    identically."""
    original = test_template(num_round=10, return_model=True)
    joblib.dump(original, 'lgb.pkl')
    restored = joblib.load('lgb.pkl')
    self.assertDictEqual(original.get_params(), restored.get_params())
    X_train, X_test, y_train, y_test = test_template(return_data=True)
    original.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    restored.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    self.assertDictEqual(original.evals_result(), restored.evals_result())
    pred_orig = original.predict(X_test)
    pred_rest = restored.predict(X_test)
    self.assertEqual(len(pred_orig), len(pred_rest))
    for a, b in zip(pred_orig, pred_rest):
        self.assertAlmostEqual(a, b, places=5)
print("----------------------------------------------------------------------")
print("running test_sklearn.py")
unittest.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment