Commit eba6d200 authored by wxchan's avatar wxchan
Browse files

Squash into one commit:

1. merge python-package
2. add dump model to json
3. fix bugs
4. clean code with pylint
5. update python examples
parent 19e085c9
......@@ -18,6 +18,7 @@ public:
if (sigmoid_ <= 0.0) {
Log::Fatal("Sigmoid parameter %f should be greater than zero", sigmoid_);
}
scale_pos_weight_ = static_cast<score_t>(config.scale_pos_weight);
}
~BinaryLogloss() {}
void Init(const Metadata& metadata, data_size_t num_data) override {
......@@ -55,6 +56,7 @@ public:
label_weights_[0] = 1.0f;
}
}
label_weights_[1] *= scale_pos_weight_;
}
void GetGradients(const score_t* score, score_t* gradients, score_t* hessians) const override {
......@@ -104,6 +106,7 @@ private:
score_t label_weights_[2];
/*! \brief Weights for data */
const float* weights_;
score_t scale_pos_weight_;
};
} // namespace LightGBM
......
......@@ -16,6 +16,8 @@ def LoadDll():
LIB = LoadDll()
LIB.LGBM_GetLastError.restype = ctypes.c_char_p
dtype_float32 = 0
dtype_float64 = 1
dtype_int32 = 2
......@@ -33,9 +35,10 @@ def test_load_from_file(filename, reference):
if reference != None:
ref = ctypes.byref(reference)
handle = ctypes.c_void_p()
LIB.LGBM_CreateDatasetFromFile(c_str(filename),
LIB.LGBM_DatasetCreateFromFile(c_str(filename),
c_str('max_bin=15'),
ref, ctypes.byref(handle) )
print(LIB.LGBM_GetLastError())
num_data = ctypes.c_long()
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data) )
num_feature = ctypes.c_long()
......@@ -46,15 +49,6 @@ def test_load_from_file(filename, reference):
def test_save_to_binary(handle, filename):
LIB.LGBM_DatasetSaveBinary(handle, c_str(filename))
def test_load_from_binary(filename):
handle = ctypes.c_void_p()
LIB.LGBM_CreateDatasetFromBinaryFile(c_str(filename), ctypes.byref(handle) )
num_data = ctypes.c_long()
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data) )
num_feature = ctypes.c_long()
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature) )
print ('#data:%d #feature:%d' %(num_data.value, num_feature.value) )
return handle
def test_load_from_csr(filename, reference):
data = []
......@@ -72,7 +66,7 @@ def test_load_from_csr(filename, reference):
if reference != None:
ref = ctypes.byref(reference)
LIB.LGBM_CreateDatasetFromCSR(c_array(ctypes.c_int, csr.indptr),
LIB.LGBM_DatasetCreateFromCSR(c_array(ctypes.c_int, csr.indptr),
dtype_int32,
c_array(ctypes.c_int, csr.indices),
csr.data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)),
......@@ -107,7 +101,7 @@ def test_load_from_csc(filename, reference):
if reference != None:
ref = ctypes.byref(reference)
LIB.LGBM_CreateDatasetFromCSC(c_array(ctypes.c_int, csr.indptr),
LIB.LGBM_DatasetCreateFromCSC(c_array(ctypes.c_int, csr.indptr),
dtype_int32,
c_array(ctypes.c_int, csr.indices),
csr.data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)),
......@@ -142,7 +136,7 @@ def test_load_from_mat(filename, reference):
if reference != None:
ref = ctypes.byref(reference)
LIB.LGBM_CreateDatasetFromMat(data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)),
LIB.LGBM_DatasetCreateFromMat(data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)),
dtype_float64,
mat.shape[0],
mat.shape[1],
......@@ -170,35 +164,36 @@ def test_dataset():
test_free_dataset(test)
test_save_to_binary(train, 'train.binary.bin')
test_free_dataset(train)
train = test_load_from_binary('train.binary.bin')
train = test_load_from_file('train.binary.bin', None)
test_free_dataset(train)
def test_booster():
train = test_load_from_mat('../../examples/binary_classification/binary.train', None)
test = [test_load_from_mat('../../examples/binary_classification/binary.test', train)]
name = [c_str('test')]
test = test_load_from_mat('../../examples/binary_classification/binary.test', train)
booster = ctypes.c_void_p()
LIB.LGBM_BoosterCreate(train, c_array(ctypes.c_void_p, test), c_array(ctypes.c_char_p, name),
len(test), c_str("app=binary metric=auc num_leaves=31 verbose=0"), ctypes.byref(booster))
LIB.LGBM_BoosterCreate(train, c_str("app=binary metric=auc num_leaves=31 verbose=0"), ctypes.byref(booster))
LIB.LGBM_BoosterAddValidData(booster, test)
is_finished = ctypes.c_int(0)
for i in range(100):
LIB.LGBM_BoosterUpdateOneIter(booster,ctypes.byref(is_finished))
result = np.array([0.0], dtype=np.float32)
out_len = ctypes.c_ulong(0)
LIB.LGBM_BoosterEval(booster, 0, ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))
LIB.LGBM_BoosterGetEval(booster, 0, ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))
print ('%d Iteration test AUC %f' %(i, result[0]))
LIB.LGBM_BoosterSaveModel(booster, -1, c_str('model.txt'))
LIB.LGBM_BoosterFree(booster)
test_free_dataset(train)
test_free_dataset(test[0])
test_free_dataset(test)
booster2 = ctypes.c_void_p()
LIB.LGBM_BoosterLoadFromModelfile(c_str('model.txt'), ctypes.byref(booster2))
num_total_model = ctypes.c_long()
LIB.LGBM_BoosterCreateFromModelfile(c_str('model.txt'), ctypes.byref(num_total_model), ctypes.byref(booster2))
data = []
inp = open('../../examples/binary_classification/binary.test', 'r')
for line in inp.readlines():
data.append( [float(x) for x in line.split('\t')[1:]] )
inp.close()
mat = np.array(data)
preb = np.zeros(( mat.shape[0],1 ), dtype=np.float64)
preb = np.zeros(mat.shape[0], dtype=np.float32)
num_preb = ctypes.c_long()
data = np.array(mat.reshape(mat.size), copy=False)
LIB.LGBM_BoosterPredictForMat(booster2,
data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)),
......@@ -208,10 +203,10 @@ def test_booster():
1,
1,
50,
ctypes.byref(num_preb),
preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
LIB.LGBM_BoosterPredictForFile(booster2, 1, 50, 0, c_str('../../examples/binary_classification/binary.test'), c_str('preb.txt'))
LIB.LGBM_BoosterPredictForFile(booster2,c_str('../../examples/binary_classification/binary.test'),0 , 0, 50, c_str('preb.txt'))
LIB.LGBM_BoosterFree(booster2)
test_dataset()
test_booster()
# coding: utf-8
import numpy as np
from sklearn import datasets, metrics, model_selection
import lightgbm as lgb
X, Y = datasets.make_classification(n_samples=100000, n_features=100, random_state=42)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.1, random_state=42)
train_data = lgb.Dataset(x_train, max_bin=255, label=y_train)
valid_data = train_data.create_valid(x_test, label=y_test)
config={"objective":"binary","metric":"auc", "min_data":1, "num_leaves":15}
bst = lgb.Booster(params=config, train_set=train_data)
bst.add_valid(valid_data,"valid_1")
for i in range(100):
bst.update()
if i % 10 == 0:
print(bst.eval_train())
print(bst.eval_valid())
bst.save_model("model.txt")
import numpy as np
import random
import lightgbm as lgb
rng = np.random.RandomState(2016)
def test_binary_classification():
from sklearn import datasets, metrics, model_selection
X, y = datasets.make_classification(n_samples=10000, n_features=100)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
lgb_model = lgb.LGBMClassifier().fit(x_train, y_train)
from sklearn.datasets import load_digits
digits = load_digits(2)
y = digits['target']
X = digits['data']
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
lgb_model = lgb.LGBMClassifier().fit(x_train, y_train)
preds = lgb_model.predict(x_test)
err = sum(1 for i in range(len(preds))
if int(preds[i] > 0.5) != y_test[i]) / float(len(preds))
assert err < 0.1
def test_multiclass_classification():
from sklearn.datasets import load_iris
from sklearn import datasets, metrics, model_selection
def check_pred(preds, labels):
err = sum(1 for i in range(len(preds))
if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
assert err < 0.7
X, y = datasets.make_classification(n_samples=10000, n_features=100, n_classes=4, n_informative=3)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
lgb_model = lgb.LGBMClassifier().fit(x_train, y_train)
preds = lgb_model.predict(x_test)
check_pred(preds, y_test)
def test_regression():
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston
from sklearn.cross_validation import KFold
from sklearn import datasets, metrics, model_selection
boston = load_boston()
y = boston['target']
X = boston['data']
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
lgb_model = lgb.LGBMRegressor().fit(x_train, y_train)
preds = lgb_model.predict(x_test)
assert mean_squared_error(preds, y_test) < 100
def test_regression_with_custom_objective():
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston
from sklearn.cross_validation import KFold
from sklearn import datasets, metrics, model_selection
def objective_ls(y_true, y_pred):
grad = (y_pred - y_true)
hess = np.ones(len(y_true))
return grad, hess
boston = load_boston()
y = boston['target']
X = boston['data']
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
lgb_model = lgb.LGBMRegressor(objective=objective_ls).fit(x_train, y_train)
preds = lgb_model.predict(x_test)
assert mean_squared_error(preds, y_test) < 100
def test_binary_classification_with_custom_objective():
from sklearn import datasets, metrics, model_selection
def logregobj(y_true, y_pred):
y_pred = 1.0 / (1.0 + np.exp(-y_pred))
grad = y_pred - y_true
hess = y_pred * (1.0 - y_pred)
return grad, hess
X, y = datasets.make_classification(n_samples=10000, n_features=100)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
lgb_model = lgb.LGBMClassifier(objective=logregobj).fit(x_train, y_train)
from sklearn.datasets import load_digits
digits = load_digits(2)
y = digits['target']
X = digits['data']
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=1)
lgb_model = lgb.LGBMClassifier(objective=logregobj).fit(x_train, y_train)
preds = lgb_model.predict(x_test)
err = sum(1 for i in range(len(preds))
if int(preds[i] > 0.5) != y_test[i]) / float(len(preds))
assert err < 0.1
def test_early_stopping():
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston
from sklearn.cross_validation import KFold
from sklearn import datasets, metrics, model_selection
boston = load_boston()
y = boston['target']
X = boston['data']
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
lgb_model = lgb.LGBMRegressor(n_estimators=500) \
.fit(x_train, y_train, eval_set=[(x_test, y_test)],
eval_metric='l2',
early_stopping_rounds=10,
verbose=10)
print(lgb_model.best_iteration)
test_binary_classification()
test_multiclass_classification()
test_regression()
test_regression_with_custom_objective()
test_binary_classification_with_custom_objective()
test_early_stopping()
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment