Commit dd425973 authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

python code style with pep8 (#161)

* format python code with pep8

* **DO NOT MERGE** deliberately break rules to see what will happen during check

* Revert "**DO NOT MERGE** deliberately break rules to see what will happen during check"

This reverts commit 0db93cd7a43c7efa43a2112ada43d46c6f9115d9.

* fix format in test.py

* add docs for pep-8
parent c4778e73
......@@ -13,8 +13,9 @@ before_install:
- conda update -q conda
install:
- sudo apt-get install -y libopenmpi-dev openmpi-bin build-essential
- sudo apt-get install -y libopenmpi-dev openmpi-bin build-essential
- conda install --yes atlas numpy scipy scikit-learn
- pip install pep8
script:
......@@ -23,9 +24,9 @@ script:
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py
- cd $TRAVIS_BUILD_DIR
- cd $TRAVIS_BUILD_DIR && pep8 --ignore=E501 .
- rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make -j
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py
......
......@@ -27,15 +27,15 @@ lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
# specify your configurations as a dict
params = {
'boosting_type' : 'gbdt',
'objective' : 'binary',
'metric' : 'binary_logloss',
'num_leaves' : 31,
'learning_rate' : 0.05,
'feature_fraction' : 0.9,
'bagging_fraction' : 0.8,
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': 'binary_logloss',
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose' : 0
'verbose': 0
}
# generate a feature name
......@@ -46,7 +46,7 @@ print('Start training...')
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train, # eval training data
valid_sets=lgb_train, # eval training data
feature_name=feature_name,
categorical_feature=[21])
......@@ -88,10 +88,11 @@ gbm = lgb.train(params,
num_boost_round=10,
init_model=gbm,
valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(bagging_fraction=[0.7]*5+[0.6]*5)])
callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])
print('Finish 30 - 40 rounds with changing bagging_fraction...')
# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
......@@ -102,6 +103,7 @@ def loglikelood(preds, train_data):
hess = preds * (1. - preds)
return grad, hess
# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
# binary error
......@@ -109,6 +111,7 @@ def binary_error(preds, train_data):
labels = train_data.get_label()
return 'error', np.mean(labels != (preds > 0.5)), False
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
......@@ -120,6 +123,8 @@ gbm = lgb.train(params,
print('Finish 40 - 50 rounds with self-defined objective function and eval metric...')
print('Start a new training job...')
# callback
def reset_metrics():
def callback(env):
......@@ -131,6 +136,7 @@ def reset_metrics():
callback.order = 0
return callback
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
......
......@@ -21,16 +21,16 @@ lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict
params = {
'task' : 'train',
'boosting_type' : 'gbdt',
'objective' : 'regression',
'metric' : {'l2', 'auc'},
'num_leaves' : 31,
'learning_rate' : 0.05,
'feature_fraction' : 0.9,
'bagging_fraction' : 0.8,
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'regression',
'metric': {'l2', 'auc'},
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose' : 0
'verbose': 0
}
print('Start training...')
......
......@@ -31,3 +31,19 @@ Troubleshooting
setup.py directory, *never* absolute paths.
- **Solution 1**: please check `here <http://stackoverflow.com/questions/18085571/pip-install-error-setup-script-specifies-an-absolute-path>`__.
Developments
--------
The code style of python package follows `pep-8 <https://www.python.org/dev/peps/pep-0008/>`__. If you would like to make a contribution and not familiar with pep-8, please check the pep-8 style guide first. Otherwise, you won't pass the check. You should be careful about:
- E1 Indentation (check pep-8 link above)
- E202 whitespace before and after brackets
- E225 missing whitespace around operator
- E226 missing whitespace around arithmetic operator
- E261 at least two spaces before inline comment
- E301 expected 1 blank line in front of and at the end of a method
- E302 expected 2 blank lines in front of and at the end of a function or a class
You can ignore E501 (line too long).
......@@ -6,8 +6,6 @@ Contributors: https://github.com/Microsoft/LightGBM/graphs/contributors
from __future__ import absolute_import
import os
from .basic import Dataset, Booster
from .engine import train, cv
from .callback import print_evaluation, record_evaluation, reset_parameter, early_stopping
......@@ -23,4 +21,3 @@ __all__ = ['Dataset', 'Booster',
'train', 'cv',
'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker',
'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping']
......@@ -5,11 +5,11 @@
"""Wrapper c_api of LightGBM"""
from __future__ import absolute_import
import sys
import ctypes
import json
from tempfile import NamedTemporaryFile
import os
import sys
from tempfile import NamedTemporaryFile
import numpy as np
import scipy.sparse
......@@ -22,11 +22,13 @@ try:
except ImportError:
class Series(object):
pass
class DataFrame(object):
pass
IS_PY3 = (sys.version_info[0] == 3)
def _load_lib():
"""Load LightGBM Library."""
lib_path = find_lib_path()
......@@ -36,12 +38,15 @@ def _load_lib():
lib.LGBM_GetLastError.restype = ctypes.c_char_p
return lib
_LIB = _load_lib()
class LightGBMError(Exception):
"""Error throwed by LightGBM"""
pass
def _safe_call(ret):
"""Check the return value of C API call
Parameters
......@@ -52,6 +57,7 @@ def _safe_call(ret):
if ret != 0:
raise LightGBMError(_LIB.LGBM_GetLastError())
def is_str(s):
"""Check is a str or not"""
if IS_PY3:
......@@ -59,6 +65,7 @@ def is_str(s):
else:
return isinstance(s, basestring)
def is_numeric(obj):
"""Check is a number or not, include numpy number etc."""
try:
......@@ -67,19 +74,23 @@ def is_numeric(obj):
except:
return False
def is_numpy_object(data):
"""Check is numpy object"""
return type(data).__module__ == np.__name__
def is_numpy_1d_array(data):
"""Check is 1d numpy array"""
return isinstance(data, np.ndarray) and len(data.shape) == 1
def is_1d_list(data):
"""Check is 1d list"""
return isinstance(data, list) and \
(not data or isinstance(data[0], (int, float, bool)))
def list_to_1d_numpy(data, dtype=np.float32, name='list'):
"""convert to 1d numpy array"""
if is_numpy_1d_array(data):
......@@ -94,6 +105,7 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'):
else:
raise TypeError("Wrong type({}) for {}, should be list or numpy array".format(type(data).__name__, name))
def cfloat32_array_to_numpy(cptr, length):
"""Convert a ctypes float pointer array to a numpy array.
"""
......@@ -102,6 +114,7 @@ def cfloat32_array_to_numpy(cptr, length):
else:
raise RuntimeError('Expected float pointer')
def cint32_array_to_numpy(cptr, length):
"""Convert a ctypes float pointer array to a numpy array.
"""
......@@ -110,44 +123,52 @@ def cint32_array_to_numpy(cptr, length):
else:
raise RuntimeError('Expected int pointer')
def c_str(string):
"""Convert a python string to cstring."""
return ctypes.c_char_p(string.encode('utf-8'))
def c_array(ctype, values):
"""Convert a python array to c array."""
return (ctype * len(values))(*values)
def param_dict_to_str(data):
if data is None or not data:
return ""
pairs = []
for key, val in data.items():
if isinstance(val, (list, tuple, set)) or is_numpy_1d_array(val):
pairs.append(str(key)+'='+','.join(map(str, val)))
pairs.append(str(key) + '=' + ','.join(map(str, val)))
elif is_str(val) or isinstance(val, (int, float, bool)) or is_numeric(val):
pairs.append(str(key)+'='+str(val))
pairs.append(str(key) + '=' + str(val))
else:
raise TypeError('Unknown type of parameter:%s, got:%s'
% (key, type(val).__name__))
return ' '.join(pairs)
class _temp_file(object):
def __enter__(self):
with NamedTemporaryFile(prefix="lightgbm_tmp_", delete=True) as f:
self.name = f.name
return self
def __exit__(self, exc_type, exc_val, exc_tb):
if os.path.isfile(self.name):
os.remove(self.name)
def readlines(self):
with open(self.name, "r+") as f:
ret = f.readlines()
return ret
def writelines(self, lines):
with open(self.name, "w+") as f:
f.writelines(lines)
"""marco definition of data type in c_api of LightGBM"""
C_API_DTYPE_FLOAT32 = 0
C_API_DTYPE_FLOAT64 = 1
......@@ -168,6 +189,7 @@ FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32,
"init_score": C_API_DTYPE_FLOAT32,
"group": C_API_DTYPE_INT32}
def c_float_array(data):
"""get pointer of float numpy array / list"""
if is_1d_list(data):
......@@ -186,6 +208,7 @@ def c_float_array(data):
raise TypeError("Unknown type({})".format(type(data).__name__))
return (ptr_data, type_data)
def c_int_array(data):
"""get pointer of int numpy array / list"""
if is_1d_list(data):
......@@ -204,6 +227,7 @@ def c_int_array(data):
raise TypeError("Unknown type({})".format(type(data).__name__))
return (ptr_data, type_data)
class _InnerPredictor(object):
"""
A _InnerPredictor of LightGBM.
......@@ -255,7 +279,6 @@ class _InnerPredictor(object):
if self.__is_manage_handle:
_safe_call(_LIB.LGBM_BoosterFree(self.handle))
def predict(self, data, num_iteration=-1,
raw_score=False, pred_leaf=False, data_has_header=False,
is_reshape=True):
......@@ -374,8 +397,7 @@ class _InnerPredictor(object):
predict_type,
num_iteration,
ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
))
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
if n_preds != out_num_preds.value:
raise ValueError("Wrong length for predict results")
return preds, mat.shape[0]
......@@ -405,8 +427,7 @@ class _InnerPredictor(object):
predict_type,
num_iteration,
ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
))
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
if n_preds != out_num_preds.value:
raise ValueError("Wrong length for predict results")
return preds, nrow
......@@ -436,17 +457,18 @@ class _InnerPredictor(object):
predict_type,
num_iteration,
ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
))
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
if n_preds != out_num_preds.value:
raise ValueError("Wrong length for predict results")
return preds, nrow
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'int64': 'int', 'uint8': 'int', 'uint16': 'int',
'uint32': 'int', 'uint64': 'int', 'float16': 'float',
'float32': 'float', 'float64': 'float', 'bool': 'int'}
def _data_from_pandas(data):
if isinstance(data, DataFrame):
data_dtypes = data.dtypes
......@@ -459,6 +481,7 @@ def _data_from_pandas(data):
data = data.values.astype('float')
return data
def _label_from_pandas(label):
if isinstance(label, DataFrame):
if len(label.columns) > 1:
......@@ -469,6 +492,7 @@ def _label_from_pandas(label):
label = label.values.astype('float')
return label
class _InnerDataset(object):
"""_InnerDataset used in LightGBM.
_InnerDataset is a internal data structure that used by LightGBM.
......@@ -536,8 +560,8 @@ class _InnerDataset(object):
elif isinstance(name, int):
categorical_indices.add(name)
else:
raise TypeError("Wrong type({}) or unknown name({}) in categorical_feature" \
.format(type(name).__name__, name))
raise TypeError("Wrong type({}) or unknown name({}) in categorical_feature"
.format(type(name).__name__, name))
params['categorical_column'] = sorted(categorical_indices)
......@@ -552,7 +576,7 @@ class _InnerDataset(object):
if is_str(data):
"""check data has header or not"""
if str(params.get("has_header", "")).lower() == "true" \
or str(params.get("header", "")).lower() == "true":
or str(params.get("header", "")).lower() == "true":
self.data_has_header = True
self.handle = ctypes.c_void_p()
_safe_call(_LIB.LGBM_DatasetCreateFromFile(
......@@ -927,6 +951,7 @@ class _InnerDataset(object):
ctypes.byref(ret)))
return ret.value
class Dataset(object):
"""High level Dataset used in LightGBM.
"""
......@@ -1140,7 +1165,6 @@ class Dataset(object):
"""
self._get_inner_dataset().save_binary(filename)
def set_label(self, label):
"""
Set label of Dataset
......@@ -1273,6 +1297,7 @@ class Dataset(object):
else:
raise LightGBMError("Cannot call num_feature before construct, please call it explicitly")
class Booster(object):
""""A Booster of LightGBM.
"""
......@@ -1397,7 +1422,7 @@ class Booster(object):
Name of validation data
"""
if not isinstance(data, Dataset):
raise TypeError('valid data should be Dataset instance, met {}'.format(type(train_set).__name__))
raise TypeError('valid data should be Dataset instance, met {}'.format(type(data).__name__))
if data._predictor is not self.__init_predictor:
raise LightGBMError("Add validation data failed, you should use same predictor for these data")
_safe_call(_LIB.LGBM_BoosterAddValidData(
......@@ -1578,8 +1603,8 @@ class Booster(object):
result: str
Evaluation result list.
"""
return [item for i in range(1, self.__num_dataset) \
for item in self.__inner_eval(self.name_valid_sets[i-1], i, feval)]
return [item for i in range(1, self.__num_dataset)
for item in self.__inner_eval(self.name_valid_sets[i - 1], i, feval)]
def save_model(self, filename, num_iteration=-1):
"""
......@@ -1684,6 +1709,7 @@ class Booster(object):
raise KeyError("importance_type must be split or gain")
dump_model = self.dump_model()
ret = [0] * (dump_model["max_feature_idx"] + 1)
def dfs(root):
if "split_feature" in root:
if importance_type == 'split':
......@@ -1773,7 +1799,7 @@ class Booster(object):
"""Get name of evals"""
tmp_out_len = ctypes.c_int64(0)
string_buffers = [ctypes.create_string_buffer(255) for i in range(self.__num_inner_eval)]
ptr_string_buffers = (ctypes.c_char_p*self.__num_inner_eval)(*map(ctypes.addressof, string_buffers))
ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers))
_safe_call(_LIB.LGBM_BoosterGetEvalNames(
self.handle,
ctypes.byref(tmp_out_len),
......
......@@ -3,6 +3,7 @@
from __future__ import absolute_import
import collections
class EarlyStopException(Exception):
"""Exception of early stopping.
Parameters
......@@ -14,6 +15,7 @@ class EarlyStopException(Exception):
super(EarlyStopException, self).__init__()
self.best_iteration = best_iteration
# Callback environment used by callbacks
CallbackEnv = collections.namedtuple(
"LightGBMCallbackEnv",
......@@ -24,6 +26,7 @@ CallbackEnv = collections.namedtuple(
"end_iteration",
"evaluation_result_list"])
def _format_eval_result(value, show_stdv=True):
"""format metric string"""
if len(value) == 4:
......@@ -58,8 +61,9 @@ def print_evaluation(period=1, show_stdv=True):
if not env.evaluation_result_list or period <= 0:
return
if (env.iteration + 1) % period == 0:
result = '\t'.join([_format_eval_result(x, show_stdv) \
for x in env.evaluation_result_list])
result = '\t'.join(
[_format_eval_result(x, show_stdv) for x in env.evaluation_result_list]
)
print('[%d]\t%s' % (env.iteration + 1, result))
callback.order = 10
return callback
......@@ -152,6 +156,7 @@ def early_stopping(stopping_rounds, verbose=True):
best_score = {}
best_iter = {}
best_msg = {}
def init(env):
"""internal function"""
if not env.evaluation_result_list:
......@@ -178,8 +183,11 @@ def early_stopping(stopping_rounds, verbose=True):
best_score[i] = score
best_iter[i] = env.iteration
if verbose:
best_msg[i] = '[%d]\t%s' % (env.iteration + 1, \
'\t'.join([_format_eval_result(x) for x in env.evaluation_result_list]))
best_msg[i] = '[%d]\t%s' % (
env.iteration + 1, '\t'.join(
[_format_eval_result(x) for x in env.evaluation_result_list]
)
)
else:
if env.iteration - best_iter[i] >= stopping_rounds:
if env.model is not None:
......
......@@ -9,6 +9,7 @@ import numpy as np
from .basic import LightGBMError, _InnerPredictor, Dataset, Booster, is_str
from . import callback
def train(params, train_set, num_boost_round=100,
valid_sets=None, valid_names=None,
fobj=None, feval=None, init_model=None,
......@@ -121,7 +122,7 @@ def train(params, train_set, num_boost_round=100,
if valid_names is not None and len(valid_names) > i:
name_valid_sets.append(valid_names[i])
else:
name_valid_sets.append('valid_'+str(i))
name_valid_sets.append('valid_' + str(i))
for valid_data in valid_sets:
valid_data._update_params(params)
"""process callbacks"""
......@@ -211,6 +212,7 @@ class CVBooster(object):
""""Evaluate the CVBooster for one iteration."""
return self.booster.eval_valid(feval)
try:
from sklearn.model_selection import StratifiedKFold
SKLEARN_StratifiedKFold = True
......@@ -221,6 +223,7 @@ except ImportError:
except ImportError:
SKLEARN_StratifiedKFold = False
def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=False, shuffle=True):
"""
Make an n-fold list of CVBooster from random indices.
......@@ -251,6 +254,7 @@ def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=Fals
ret.append(CVBooster(train_set, valid_set, tparam))
return ret
def _agg_cv_result(raw_results):
"""
Aggregate cross-validation results.
......@@ -263,6 +267,7 @@ def _agg_cv_result(raw_results):
cvmap[one_line[1]].append(one_line[2])
return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()]
def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
shuffle=True, metrics=None, fobj=None, feval=None, init_model=None,
feature_name=None, categorical_feature=None,
......
......@@ -26,5 +26,5 @@ def find_lib_path():
lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
if not lib_path:
dll_path = [os.path.realpath(p) for p in dll_path]
raise Exception('Cannot find lightgbm Library in following paths: '+','.join(dll_path))
raise Exception('Cannot find lightgbm Library in following paths: ' + ','.join(dll_path))
return lib_path
......@@ -25,6 +25,7 @@ except ImportError:
LGBMRegressorBase = object
LGBMLabelEncoder = None
def _objective_function_wrapper(func):
"""Decorate an objective function
Note: for multi-class task, the y_pred is group by class_id first, then group by row_id
......@@ -62,7 +63,7 @@ def _objective_function_wrapper(func):
elif argc == 3:
grad, hess = func(labels, preds, dataset.get_group())
else:
raise TypeError("Self-defined objective function should have 2 or 3 arguments, got %d" %(argc))
raise TypeError("Self-defined objective function should have 2 or 3 arguments, got %d" % argc)
"""weighted for objective"""
weight = dataset.get_weight()
if weight is not None:
......@@ -83,6 +84,7 @@ def _objective_function_wrapper(func):
return grad, hess
return inner
def _eval_function_wrapper(func):
"""Decorate an eval function
Note: for multi-class task, the y_pred is group by class_id first, then group by row_id
......@@ -128,9 +130,10 @@ def _eval_function_wrapper(func):
elif argc == 4:
return func(labels, preds, dataset.get_weight(), dataset.get_group())
else:
raise TypeError("Self-defined eval function should have 2, 3 or 4 arguments, got %d" %(argc))
raise TypeError("Self-defined eval function should have 2, 3 or 4 arguments, got %d" % argc)
return inner
class LGBMModel(LGBMModelBase):
def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
......@@ -354,9 +357,9 @@ class LGBMModel(LGBMModelBase):
if hasattr(self, 'eval_at'):
params['ndcg_eval_at'] = self.eval_at
if self.fobj:
params['objective'] = 'None' # objective = nullptr for unknown objective
params['objective'] = 'None' # objective = nullptr for unknown objective
if 'label_gain' in params and params['label_gain'] is None:
del params['label_gain'] # use default of cli version
del params['label_gain'] # use default of cli version
if callable(eval_metric):
feval = _eval_function_wrapper(eval_metric)
......@@ -474,6 +477,7 @@ class LGBMModel(LGBMModelBase):
def feature_importance(self):
return self.feature_importance_
class LGBMRegressor(LGBMModel, LGBMRegressorBase):
def fit(self, X, y,
......@@ -495,6 +499,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
callbacks=callbacks)
return self
class LGBMClassifier(LGBMModel, LGBMClassifierBase):
def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
......@@ -595,6 +600,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
raise LightGBMError('No classes found. Need to call fit beforehand.')
return self.n_classes
class LGBMRanker(LGBMModel):
def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
......@@ -646,7 +652,7 @@ class LGBMRanker(LGBMModel):
elif len(eval_group) != len(eval_set):
raise ValueError("Length of eval_group should equal to eval_set")
elif (isinstance(eval_group, dict) and any(i not in eval_group or eval_group[i] is None for i in range(len(eval_group)))) \
or (isinstance(eval_group, list) and any(group is None for group in eval_group)):
or (isinstance(eval_group, list) and any(group is None for group in eval_group)):
raise ValueError("Should set group for all eval dataset for ranking task; if you use dict, the index should start from 0")
if eval_at is not None:
......
import sys
import os
# coding: utf-8
# pylint: skip-file
import ctypes
import collections
import os
import numpy as np
from scipy import sparse
def LoadDll():
if os.name == 'nt':
lib_path = '../../windows/x64/DLL/lib_lightgbm.dll'
......@@ -14,6 +15,7 @@ def LoadDll():
lib = ctypes.cdll.LoadLibrary(lib_path)
return lib
LIB = LoadDll()
LIB.LGBM_GetLastError.restype = ctypes.c_char_p
......@@ -27,25 +29,29 @@ dtype_int64 = 3
def c_array(ctype, values):
return (ctype * len(values))(*values)
def c_str(string):
return ctypes.c_char_p(string.encode('ascii'))
def test_load_from_file(filename, reference):
ref = None
if reference != None:
if reference is not None:
ref = reference
handle = ctypes.c_void_p()
LIB.LGBM_DatasetCreateFromFile(c_str(filename),
c_str('max_bin=15'),
ref, ctypes.byref(handle) )
LIB.LGBM_DatasetCreateFromFile(
c_str(filename),
c_str('max_bin=15'),
ref, ctypes.byref(handle))
print(LIB.LGBM_GetLastError())
num_data = ctypes.c_long()
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data) )
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_long()
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature) )
print ('#data:%d #feature:%d' %(num_data.value, num_feature.value) )
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
print('#data:%d #feature:%d' % (num_data.value, num_feature.value))
return handle
def test_save_to_binary(handle, filename):
LIB.LGBM_DatasetSaveBinary(handle, c_str(filename))
......@@ -55,105 +61,113 @@ def test_load_from_csr(filename, reference):
label = []
inp = open(filename, 'r')
for line in inp.readlines():
data.append( [float(x) for x in line.split('\t')[1:]] )
label.append( float(line.split('\t')[0]) )
data.append([float(x) for x in line.split('\t')[1:]])
label.append(float(line.split('\t')[0]))
inp.close()
mat = np.array(data)
label = np.array(label, dtype=np.float32)
csr = sparse.csr_matrix(mat)
handle = ctypes.c_void_p()
ref = None
if reference != None:
if reference is not None:
ref = reference
LIB.LGBM_DatasetCreateFromCSR(c_array(ctypes.c_int, csr.indptr),
dtype_int32,
c_array(ctypes.c_int, csr.indices),
LIB.LGBM_DatasetCreateFromCSR(
c_array(ctypes.c_int, csr.indptr),
dtype_int32,
c_array(ctypes.c_int, csr.indices),
csr.data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)),
dtype_float64,
len(csr.indptr),
dtype_float64,
len(csr.indptr),
len(csr.data),
csr.shape[1],
c_str('max_bin=15'),
ref,
ctypes.byref(handle) )
csr.shape[1],
c_str('max_bin=15'),
ref,
ctypes.byref(handle))
num_data = ctypes.c_long()
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data) )
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_long()
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature) )
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField(handle, c_str('label'), c_array(ctypes.c_float, label), len(label), 0)
print ('#data:%d #feature:%d' %(num_data.value, num_feature.value) )
print('#data:%d #feature:%d' % (num_data.value, num_feature.value))
return handle
def test_load_from_csc(filename, reference):
data = []
label = []
inp = open(filename, 'r')
for line in inp.readlines():
data.append( [float(x) for x in line.split('\t')[1:]] )
label.append( float(line.split('\t')[0]) )
data.append([float(x) for x in line.split('\t')[1:]])
label.append(float(line.split('\t')[0]))
inp.close()
mat = np.array(data)
label = np.array(label, dtype=np.float32)
csr = sparse.csc_matrix(mat)
handle = ctypes.c_void_p()
ref = None
if reference != None:
if reference is not None:
ref = reference
LIB.LGBM_DatasetCreateFromCSC(c_array(ctypes.c_int, csr.indptr),
dtype_int32,
c_array(ctypes.c_int, csr.indices),
LIB.LGBM_DatasetCreateFromCSC(
c_array(ctypes.c_int, csr.indptr),
dtype_int32,
c_array(ctypes.c_int, csr.indices),
csr.data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)),
dtype_float64,
len(csr.indptr),
dtype_float64,
len(csr.indptr),
len(csr.data),
csr.shape[0],
c_str('max_bin=15'),
ref,
ctypes.byref(handle) )
csr.shape[0],
c_str('max_bin=15'),
ref,
ctypes.byref(handle))
num_data = ctypes.c_long()
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data) )
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_long()
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature) )
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField(handle, c_str('label'), c_array(ctypes.c_float, label), len(label), 0)
print ('#data:%d #feature:%d' %(num_data.value, num_feature.value) )
print('#data:%d #feature:%d' % (num_data.value, num_feature.value))
return handle
def test_load_from_mat(filename, reference):
data = []
label = []
inp = open(filename, 'r')
for line in inp.readlines():
data.append( [float(x) for x in line.split('\t')[1:]] )
label.append( float(line.split('\t')[0]) )
data.append([float(x) for x in line.split('\t')[1:]])
label.append(float(line.split('\t')[0]))
inp.close()
mat = np.array(data)
data = np.array(mat.reshape(mat.size), copy=False)
label = np.array(label, dtype=np.float32)
handle = ctypes.c_void_p()
ref = None
if reference != None:
if reference is not None:
ref = reference
LIB.LGBM_DatasetCreateFromMat(data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)),
LIB.LGBM_DatasetCreateFromMat(data.ctypes.data_as(
ctypes.POINTER(ctypes.c_void_p)),
dtype_float64,
mat.shape[0],
mat.shape[1],
1,
c_str('max_bin=15'),
ref,
ctypes.byref(handle) )
c_str('max_bin=15'),
ref,
ctypes.byref(handle))
num_data = ctypes.c_long()
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data) )
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_long()
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature) )
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField(handle, c_str('label'), c_array(ctypes.c_float, label), len(label), 0)
print ('#data:%d #feature:%d' %(num_data.value, num_feature.value) )
print('#data:%d #feature:%d' % (num_data.value, num_feature.value))
return handle
def test_free_dataset(handle):
LIB.LGBM_DatasetFree(handle)
def test_dataset():
train = test_load_from_file('../../examples/binary_classification/binary.train', None)
test = test_load_from_mat('../../examples/binary_classification/binary.test', train)
......@@ -164,8 +178,10 @@ def test_dataset():
test_free_dataset(test)
test_save_to_binary(train, 'train.binary.bin')
test_free_dataset(train)
train = test_load_from_file('train.binary.bin', None)
train = test_load_from_file('train.binary.bin', None)
test_free_dataset(train)
def test_booster():
train = test_load_from_mat('../../examples/binary_classification/binary.train', None)
test = test_load_from_mat('../../examples/binary_classification/binary.test', train)
......@@ -174,11 +190,11 @@ def test_booster():
LIB.LGBM_BoosterAddValidData(booster, test)
is_finished = ctypes.c_int(0)
for i in range(100):
LIB.LGBM_BoosterUpdateOneIter(booster,ctypes.byref(is_finished))
LIB.LGBM_BoosterUpdateOneIter(booster, ctypes.byref(is_finished))
result = np.array([0.0], dtype=np.float64)
out_len = ctypes.c_ulong(0)
LIB.LGBM_BoosterGetEval(booster, 0, ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
print ('%d Iteration test AUC %f' %(i, result[0]))
print('%d Iteration test AUC %f' % (i, result[0]))
LIB.LGBM_BoosterSaveModel(booster, -1, c_str('model.txt'))
LIB.LGBM_BoosterFree(booster)
test_free_dataset(train)
......@@ -189,14 +205,15 @@ def test_booster():
data = []
inp = open('../../examples/binary_classification/binary.test', 'r')
for line in inp.readlines():
data.append( [float(x) for x in line.split('\t')[1:]] )
data.append([float(x) for x in line.split('\t')[1:]])
inp.close()
mat = np.array(data)
preb = np.zeros(mat.shape[0], dtype=np.float64)
num_preb = ctypes.c_long()
data = np.array(mat.reshape(mat.size), copy=False)
LIB.LGBM_BoosterPredictForMat(booster2,
data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)),
LIB.LGBM_BoosterPredictForMat(
booster2,
data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)),
dtype_float64,
mat.shape[0],
mat.shape[1],
......@@ -205,8 +222,9 @@ def test_booster():
50,
ctypes.byref(num_preb),
preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
LIB.LGBM_BoosterPredictForFile(booster2,c_str('../../examples/binary_classification/binary.test'),0 , 0, 50, c_str('preb.txt'))
LIB.LGBM_BoosterPredictForFile(booster2, c_str('../../examples/binary_classification/binary.test'), 0, 0, 50, c_str('preb.txt'))
LIB.LGBM_BoosterFree(booster2)
test_dataset()
test_booster()
# coding: utf-8
# pylint: skip-file
import unittest, tempfile, os
import os
import tempfile
import unittest
import lightgbm as lgb
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import lightgbm as lgb
class TestBasic(unittest.TestCase):
......@@ -14,11 +18,11 @@ class TestBasic(unittest.TestCase):
valid_data = train_data.create_valid(X_test, label=y_test)
params = {
"objective" : "binary",
"metric" : "auc",
"min_data" : 1,
"num_leaves" : 15,
"verbose" : -1
"objective": "binary",
"metric": "auc",
"min_data": 1,
"num_leaves": 15,
"verbose": -1
}
bst = lgb.Booster(params, train_data)
bst.add_valid(valid_data, "valid_1")
......@@ -38,13 +42,14 @@ class TestBasic(unittest.TestCase):
self.assertEqual(len(pred_from_matr), len(pred_from_file))
for preds in zip(pred_from_matr, pred_from_file):
self.assertAlmostEqual(*preds, places=15)
#check saved model persistence
# check saved model persistence
bst = lgb.Booster(params, model_file="model.txt")
pred_from_model_file = bst.predict(X_test)
self.assertEqual(len(pred_from_matr), len(pred_from_model_file))
for preds in zip(pred_from_matr, pred_from_model_file):
self.assertAlmostEqual(*preds, places=15)
print("----------------------------------------------------------------------")
print("running test_basic.py")
unittest.main()
# coding: utf-8
# pylint: skip-file
import os, unittest, math, copy
import numpy as np
import copy
import math
import os
import unittest
import lightgbm as lgb
from sklearn.metrics import log_loss, mean_squared_error, mean_absolute_error
from sklearn.datasets import load_breast_cancer, load_boston, load_digits, load_iris
import numpy as np
from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
load_iris)
from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
# Prefer the C-accelerated cPickle on Python 2; fall back to the
# standard pickle module on Python 3 (where cPickle no longer exists).
# Catch only ImportError — a bare `except:` would also hide unrelated
# failures such as KeyboardInterrupt or SystemExit.
try:
    import cPickle as pickle
except ImportError:
    import pickle
def multi_logloss(y_true, y_pred):
    """Mean multiclass log loss of the true labels under predicted probabilities.

    y_true: sequence of integer class labels.
    y_pred: sequence of per-sample probability vectors, indexable by class.
    """
    losses = []
    for idx, label in enumerate(y_true):
        # negative log-likelihood of the true class for this sample
        losses.append(-math.log(y_pred[idx][label]))
    return np.mean(losses)
def test_template(params = {'objective' : 'regression', 'metric' : 'l2'},
def test_template(params={'objective': 'regression', 'metric': 'l2'},
X_y=load_boston(True), feval=mean_squared_error,
num_round=100, init_model=None, custom_eval=None,
early_stopping_rounds=10,
......@@ -23,7 +31,8 @@ def test_template(params = {'objective' : 'regression', 'metric' : 'l2'},
X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
lgb_train = lgb.Dataset(X_train, y_train, params=params)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
if return_data: return lgb_train, lgb_eval
if return_data:
return lgb_train, lgb_eval
evals_result = {}
gbm = lgb.train(params, lgb_train,
num_boost_round=num_round,
......@@ -34,16 +43,19 @@ def test_template(params = {'objective' : 'regression', 'metric' : 'l2'},
evals_result=evals_result,
early_stopping_rounds=early_stopping_rounds,
init_model=init_model)
if return_model: return gbm
else: return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration))
if return_model:
return gbm
else:
return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration))
class TestEngine(unittest.TestCase):
def test_binary(self):
X_y= load_breast_cancer(True)
X_y = load_breast_cancer(True)
params = {
'objective' : 'binary',
'metric' : 'binary_logloss'
'objective': 'binary',
'metric': 'binary_logloss'
}
evals_result, ret = test_template(params, X_y, log_loss)
self.assertLess(ret, 0.15)
......@@ -58,9 +70,9 @@ class TestEngine(unittest.TestCase):
def test_multiclass(self):
X_y = load_digits(10, True)
params = {
'objective' : 'multiclass',
'metric' : 'multi_logloss',
'num_class' : 10
'objective': 'multiclass',
'metric': 'multi_logloss',
'num_class': 10
}
evals_result, ret = test_template(params, X_y, multi_logloss)
self.assertLess(ret, 0.2)
......@@ -68,8 +80,8 @@ class TestEngine(unittest.TestCase):
def test_continue_train_and_other(self):
params = {
'objective' : 'regression',
'metric' : 'l1'
'objective': 'regression',
'metric': 'l1'
}
model_name = 'model.txt'
gbm = test_template(params, num_round=20, return_model=True, early_stopping_rounds=-1)
......@@ -88,19 +100,19 @@ class TestEngine(unittest.TestCase):
def test_continue_train_multiclass(self):
X_y = load_iris(True)
params = {
'objective' : 'multiclass',
'metric' : 'multi_logloss',
'num_class' : 3
'objective': 'multiclass',
'metric': 'multi_logloss',
'num_class': 3
}
gbm = test_template(params, X_y, num_round=20, return_model=True, early_stopping_rounds=-1)
evals_result, ret = test_template(params, X_y, feval=multi_logloss,
num_round=80, init_model=gbm)
num_round=80, init_model=gbm)
self.assertLess(ret, 1.5)
self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5)
def test_cv(self):
lgb_train, _ = test_template(return_data=True)
lgb.cv({'verbose':0}, lgb_train, num_boost_round=20, nfold=5,
lgb.cv({'verbose': 0}, lgb_train, num_boost_round=20, nfold=5,
metrics='l1', verbose_eval=False)
def test_save_load_copy_pickle(self):
......@@ -123,6 +135,7 @@ class TestEngine(unittest.TestCase):
for ret in other_ret:
self.assertAlmostEqual(ret_origin, ret, places=5)
print("----------------------------------------------------------------------")
print("running test_engine.py")
unittest.main()
# coding: utf-8
# pylint: skip-file
import os, unittest
import numpy as np
import unittest
import lightgbm as lgb
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.datasets import load_breast_cancer, load_boston, load_digits, load_svmlight_file
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from sklearn.base import clone
from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
load_svmlight_file)
from sklearn.externals import joblib
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
def test_template(X_y=load_boston(True), model=lgb.LGBMRegressor,
feval=mean_squared_error, num_round=100,
custom_obj=None, predict_proba=False,
return_data=False, return_model=False):
X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
if return_data: return X_train, X_test, y_train, y_test
arguments = {'n_estimators' : num_round, 'silent' : True}
if custom_obj: arguments['objective'] = custom_obj
if return_data:
return X_train, X_test, y_train, y_test
arguments = {'n_estimators': num_round, 'silent': True}
if custom_obj:
arguments['objective'] = custom_obj
gbm = model(**arguments)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
if return_model: return gbm
else: return feval(y_test, gbm.predict_proba(X_test) if predict_proba else gbm.predict(X_test))
if return_model:
return gbm
elif predict_proba:
return feval(y_test, gbm.predict_proba(X_test))
else:
return feval(y_test, gbm.predict(X_test))
class TestSklearn(unittest.TestCase):
def test_binary(self):
X_y= load_breast_cancer(True)
X_y = load_breast_cancer(True)
ret = test_template(X_y, lgb.LGBMClassifier, log_loss, predict_proba=True)
self.assertLess(ret, 0.15)
......@@ -34,6 +44,7 @@ class TestSklearn(unittest.TestCase):
def test_multiclass(self):
X_y = load_digits(10, True)
def multi_error(y_true, y_pred):
    """Error rate: fraction of predictions that disagree with the labels."""
    # elementwise comparison when inputs are arrays; np.mean of the
    # resulting boolean mask gives the misclassification rate
    mismatches = y_true != y_pred
    return np.mean(mismatches)
ret = test_template(X_y, lgb.LGBMClassifier, multi_error)
......@@ -67,6 +78,7 @@ class TestSklearn(unittest.TestCase):
hess = y_pred * (1.0 - y_pred)
return grad, hess
X_y = load_digits(2, True)
def binary_error(y_test, y_pred):
    """Error rate of probability predictions thresholded at 0.5."""
    # a prediction counts as wrong when its thresholded class
    # (1 if prob > 0.5 else 0) differs from the true label
    mistakes = [int(prob > 0.5) != label for label, prob in zip(y_test, y_pred)]
    return np.mean(mistakes)
ret = test_template(X_y, lgb.LGBMClassifier, feval=binary_error, custom_obj=logregobj)
......@@ -81,7 +93,8 @@ class TestSklearn(unittest.TestCase):
def test_grid_search(self):
X_train, X_test, y_train, y_test = test_template(return_data=True)
params = {'boosting_type': ['dart', 'gbdt'],
'n_estimators': [15, 20], 'drop_rate':[0.1, 0.2]}
'n_estimators': [15, 20],
'drop_rate': [0.1, 0.2]}
gbm = GridSearchCV(lgb.LGBMRegressor(), params, cv=3)
gbm.fit(X_train, y_train)
self.assertIn(gbm.best_params_['n_estimators'], [15, 20])
......@@ -114,6 +127,7 @@ class TestSklearn(unittest.TestCase):
for preds in zip(pred_origin, pred_pickle):
self.assertAlmostEqual(*preds, places=5)
print("----------------------------------------------------------------------")
print("running test_sklearn.py")
unittest.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment