Squash into one commit:

1. merge python-package 2. add dump model to json 3. fix bugs 4. clean code with pylint 5. update python examples

Squash into one commit:
1. merge python-package 2. add dump model to json 3. fix bugs 4. clean code with pylint 5. update python examples
eba6d200 · wxchan · 19e085c9 · eba6d200 · eba6d200 · eba6d200
Commit eba6d200 authored Dec 02, 2016 by wxchan
4 changed files
--- a/src/objective/binary_objective.hpp
+++ b/src/objective/binary_objective.hpp
@@ -18,6 +18,7 @@ public:
    if (sigmoid_ <= 0.0) {
      Log::Fatal("Sigmoid parameter %f should be greater than zero", sigmoid_);
    }
+    scale_pos_weight_ = static_cast<score_t>(config.scale_pos_weight);
  }
  ~BinaryLogloss() {}
  void Init(const Metadata& metadata, data_size_t num_data) override {
@@ -55,6 +56,7 @@ public:
        label_weights_[0] = 1.0f;
      }
    }
+    label_weights_[1] *= scale_pos_weight_;
  }

  void GetGradients(const score_t* score, score_t* gradients, score_t* hessians) const override {
@@ -104,6 +106,7 @@ private:
  score_t label_weights_[2];
  /*! \brief Weights for data */
  const float* weights_;
+  score_t scale_pos_weight_;
 };

 }  // namespace LightGBM

--- a/tests/c_api_test/test.py
+++ b/tests/c_api_test/test.py
@@ -16,6 +16,8 @@ def LoadDll():

 LIB = LoadDll()

+LIB.LGBM_GetLastError.restype = ctypes.c_char_p
+
 dtype_float32 = 0
 dtype_float64 = 1
 dtype_int32 = 2
@@ -33,9 +35,10 @@ def test_load_from_file(filename, reference):
    if reference != None:
        ref = ctypes.byref(reference)
    handle = ctypes.c_void_p()
-    LIB.LGBM_CreateDatasetFromFile(c_str(filename), 
+    LIB.LGBM_DatasetCreateFromFile(c_str(filename), 
        c_str('max_bin=15'), 
        ref, ctypes.byref(handle) )
+    print(LIB.LGBM_GetLastError())
    num_data = ctypes.c_long()
    LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data) )
    num_feature = ctypes.c_long()
@@ -46,15 +49,6 @@ def test_load_from_file(filename, reference):
 def test_save_to_binary(handle, filename):
    LIB.LGBM_DatasetSaveBinary(handle, c_str(filename))

-def test_load_from_binary(filename):
-    handle = ctypes.c_void_p()
-    LIB.LGBM_CreateDatasetFromBinaryFile(c_str(filename), ctypes.byref(handle) )
-    num_data = ctypes.c_long()
-    LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data) )
-    num_feature = ctypes.c_long()
-    LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature) )
-    print ('#data:%d #feature:%d' %(num_data.value, num_feature.value) ) 
-    return handle

 def test_load_from_csr(filename, reference):
    data = []
@@ -72,7 +66,7 @@ def test_load_from_csr(filename, reference):
    if reference != None:
        ref = ctypes.byref(reference)

-    LIB.LGBM_CreateDatasetFromCSR(c_array(ctypes.c_int, csr.indptr), 
+    LIB.LGBM_DatasetCreateFromCSR(c_array(ctypes.c_int, csr.indptr), 
        dtype_int32, 
        c_array(ctypes.c_int, csr.indices), 
        csr.data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)),
@@ -107,7 +101,7 @@ def test_load_from_csc(filename, reference):
    if reference != None:
        ref = ctypes.byref(reference)

-    LIB.LGBM_CreateDatasetFromCSC(c_array(ctypes.c_int, csr.indptr), 
+    LIB.LGBM_DatasetCreateFromCSC(c_array(ctypes.c_int, csr.indptr), 
        dtype_int32, 
        c_array(ctypes.c_int, csr.indices), 
        csr.data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)),
@@ -142,7 +136,7 @@ def test_load_from_mat(filename, reference):
    if reference != None:
        ref = ctypes.byref(reference)

-    LIB.LGBM_CreateDatasetFromMat(data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)), 
+    LIB.LGBM_DatasetCreateFromMat(data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)), 
        dtype_float64,
        mat.shape[0],
        mat.shape[1],
@@ -170,35 +164,36 @@ def test_dataset():
    test_free_dataset(test)
    test_save_to_binary(train, 'train.binary.bin')
    test_free_dataset(train)
-    train  = test_load_from_binary('train.binary.bin')
+    train  = test_load_from_file('train.binary.bin', None)
    test_free_dataset(train)
 def test_booster():
    train = test_load_from_mat('../../examples/binary_classification/binary.train', None)
-    test = [test_load_from_mat('../../examples/binary_classification/binary.test', train)]
-    name = [c_str('test')]
+    test = test_load_from_mat('../../examples/binary_classification/binary.test', train)
    booster = ctypes.c_void_p()
-    LIB.LGBM_BoosterCreate(train, c_array(ctypes.c_void_p, test), c_array(ctypes.c_char_p, name), 
-        len(test), c_str("app=binary metric=auc num_leaves=31 verbose=0"), ctypes.byref(booster))
+    LIB.LGBM_BoosterCreate(train, c_str("app=binary metric=auc num_leaves=31 verbose=0"), ctypes.byref(booster))
+    LIB.LGBM_BoosterAddValidData(booster, test)
    is_finished = ctypes.c_int(0)
    for i in range(100):
        LIB.LGBM_BoosterUpdateOneIter(booster,ctypes.byref(is_finished))
        result = np.array([0.0], dtype=np.float32)
        out_len = ctypes.c_ulong(0)
-        LIB.LGBM_BoosterEval(booster, 0, ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))
+        LIB.LGBM_BoosterGetEval(booster, 0, ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))
        print ('%d Iteration test AUC %f' %(i, result[0]))
    LIB.LGBM_BoosterSaveModel(booster, -1, c_str('model.txt'))
    LIB.LGBM_BoosterFree(booster)
    test_free_dataset(train)
-    test_free_dataset(test[0])
+    test_free_dataset(test)
    booster2 = ctypes.c_void_p()
-    LIB.LGBM_BoosterLoadFromModelfile(c_str('model.txt'), ctypes.byref(booster2))
+    num_total_model = ctypes.c_long()
+    LIB.LGBM_BoosterCreateFromModelfile(c_str('model.txt'), ctypes.byref(num_total_model), ctypes.byref(booster2))
    data = []
    inp = open('../../examples/binary_classification/binary.test', 'r')
    for line in inp.readlines():
        data.append( [float(x) for x in line.split('\t')[1:]] )
    inp.close()
    mat = np.array(data)
-    preb = np.zeros(( mat.shape[0],1 ), dtype=np.float64)
+    preb = np.zeros(mat.shape[0], dtype=np.float32)
+    num_preb = ctypes.c_long()
    data = np.array(mat.reshape(mat.size), copy=False)
    LIB.LGBM_BoosterPredictForMat(booster2,
        data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)), 
@@ -208,10 +203,10 @@ def test_booster():
        1,
        1,
        50,
+        ctypes.byref(num_preb),
        preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
-    LIB.LGBM_BoosterPredictForFile(booster2, 1, 50, 0, c_str('../../examples/binary_classification/binary.test'), c_str('preb.txt'))
+    LIB.LGBM_BoosterPredictForFile(booster2,c_str('../../examples/binary_classification/binary.test'),0 , 0, 50, c_str('preb.txt'))
    LIB.LGBM_BoosterFree(booster2)

 test_dataset()
 test_booster()
-
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
+# coding: utf-8
+import numpy as np
+from sklearn import datasets, metrics, model_selection
+import lightgbm as lgb
+
+X, Y = datasets.make_classification(n_samples=100000, n_features=100, random_state=42)
+x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.1, random_state=42)
+
+train_data = lgb.Dataset(x_train, max_bin=255, label=y_train)
+valid_data = train_data.create_valid(x_test, label=y_test)
+
+config={"objective":"binary","metric":"auc", "min_data":1, "num_leaves":15}
+bst = lgb.Booster(params=config, train_set=train_data)
+bst.add_valid(valid_data,"valid_1")
+
+for i in range(100):
+	bst.update()
+	if i % 10 == 0:
+		print(bst.eval_train())
+		print(bst.eval_valid())
+bst.save_model("model.txt")
+
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
+import numpy as np
+import random
+import lightgbm as lgb
+
+
+rng = np.random.RandomState(2016)
+
+def test_binary_classification():
+
+    from sklearn import datasets, metrics, model_selection
+
+    X, y = datasets.make_classification(n_samples=10000, n_features=100)
+    x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
+    lgb_model = lgb.LGBMClassifier().fit(x_train, y_train)
+    from sklearn.datasets import load_digits
+    digits = load_digits(2)
+    y = digits['target']
+    X = digits['data']
+    x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
+    lgb_model = lgb.LGBMClassifier().fit(x_train, y_train)
+    preds = lgb_model.predict(x_test)
+    err = sum(1 for i in range(len(preds))
+          if int(preds[i] > 0.5) != y_test[i]) / float(len(preds))
+    assert err < 0.1
+
+def test_multiclass_classification():
+    from sklearn.datasets import load_iris
+    from sklearn import datasets, metrics, model_selection
+
+    def check_pred(preds, labels):
+        err = sum(1 for i in range(len(preds))
+                  if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
+        assert err < 0.7
+
+
+    X, y = datasets.make_classification(n_samples=10000, n_features=100, n_classes=4, n_informative=3)
+
+    x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
+
+    lgb_model = lgb.LGBMClassifier().fit(x_train, y_train)
+    preds = lgb_model.predict(x_test)
+
+    check_pred(preds, y_test)
+
+def test_regression():
+    from sklearn.metrics import mean_squared_error
+    from sklearn.datasets import load_boston
+    from sklearn.cross_validation import KFold
+    from sklearn import datasets, metrics, model_selection
+
+    boston = load_boston()
+    y = boston['target']
+    X = boston['data']
+    x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
+    lgb_model = lgb.LGBMRegressor().fit(x_train, y_train)
+    preds = lgb_model.predict(x_test)
+    assert mean_squared_error(preds, y_test) < 100
+
+def test_regression_with_custom_objective():
+    from sklearn.metrics import mean_squared_error
+    from sklearn.datasets import load_boston
+    from sklearn.cross_validation import KFold
+    from sklearn import datasets, metrics, model_selection
+    def objective_ls(y_true, y_pred):
+        grad = (y_pred - y_true)
+        hess = np.ones(len(y_true))
+        return grad, hess
+    boston = load_boston()
+    y = boston['target']
+    X = boston['data']
+    x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
+    lgb_model = lgb.LGBMRegressor(objective=objective_ls).fit(x_train, y_train)
+    preds = lgb_model.predict(x_test)
+    assert mean_squared_error(preds, y_test) < 100
+
+
+def test_binary_classification_with_custom_objective():
+
+    from sklearn import datasets, metrics, model_selection
+    def logregobj(y_true, y_pred):
+        y_pred = 1.0 / (1.0 + np.exp(-y_pred))
+        grad = y_pred - y_true
+        hess = y_pred * (1.0 - y_pred)
+        return grad, hess
+    X, y = datasets.make_classification(n_samples=10000, n_features=100)
+    x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
+    lgb_model = lgb.LGBMClassifier(objective=logregobj).fit(x_train, y_train)
+    from sklearn.datasets import load_digits
+    digits = load_digits(2)
+    y = digits['target']
+    X = digits['data']
+    x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=1)
+    lgb_model = lgb.LGBMClassifier(objective=logregobj).fit(x_train, y_train)
+    preds = lgb_model.predict(x_test)
+    err = sum(1 for i in range(len(preds))
+          if int(preds[i] > 0.5) != y_test[i]) / float(len(preds))
+    assert err < 0.1
+
+def test_early_stopping():
+    from sklearn.metrics import mean_squared_error
+    from sklearn.datasets import load_boston
+    from sklearn.cross_validation import KFold
+    from sklearn import datasets, metrics, model_selection
+
+    boston = load_boston()
+    y = boston['target']
+    X = boston['data']
+    x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
+    lgb_model = lgb.LGBMRegressor(n_estimators=500) \
+            .fit(x_train, y_train, eval_set=[(x_test, y_test)], 
+                eval_metric='l2', 
+                early_stopping_rounds=10,
+                verbose=10)
+    print(lgb_model.best_iteration)
+
+test_binary_classification()
+test_multiclass_classification()
+test_regression()
+test_regression_with_custom_objective()
+test_binary_classification_with_custom_objective()
+test_early_stopping()
\ No newline at end of file