"git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "aee92f63ba124e1f6a3168eb2864d032567cbf9e"
Commit dd425973 authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

python code style with pep8 (#161)

* format python code with pep8

* **DO NOT MERGE** deliberately break rules to see what will happen during check

* Revert "**DO NOT MERGE** deliberately break rules to see what will happen during check"

This reverts commit 0db93cd7a43c7efa43a2112ada43d46c6f9115d9.

* fix format in test.py

* add docs for pep-8
parent c4778e73
...@@ -13,8 +13,9 @@ before_install: ...@@ -13,8 +13,9 @@ before_install:
- conda update -q conda - conda update -q conda
install: install:
- sudo apt-get install -y libopenmpi-dev openmpi-bin build-essential - sudo apt-get install -y libopenmpi-dev openmpi-bin build-essential
- conda install --yes atlas numpy scipy scikit-learn - conda install --yes atlas numpy scipy scikit-learn
- pip install pep8
script: script:
...@@ -23,9 +24,9 @@ script: ...@@ -23,9 +24,9 @@ script:
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py
- cd $TRAVIS_BUILD_DIR - cd $TRAVIS_BUILD_DIR && pep8 --ignore=E501 .
- rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make -j - rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make -j
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py
......
...@@ -27,15 +27,15 @@ lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, ...@@ -27,15 +27,15 @@ lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
# specify your configurations as a dict # specify your configurations as a dict
params = { params = {
'boosting_type' : 'gbdt', 'boosting_type': 'gbdt',
'objective' : 'binary', 'objective': 'binary',
'metric' : 'binary_logloss', 'metric': 'binary_logloss',
'num_leaves' : 31, 'num_leaves': 31,
'learning_rate' : 0.05, 'learning_rate': 0.05,
'feature_fraction' : 0.9, 'feature_fraction': 0.9,
'bagging_fraction' : 0.8, 'bagging_fraction': 0.8,
'bagging_freq': 5, 'bagging_freq': 5,
'verbose' : 0 'verbose': 0
} }
# generate a feature name # generate a feature name
...@@ -46,7 +46,7 @@ print('Start training...') ...@@ -46,7 +46,7 @@ print('Start training...')
gbm = lgb.train(params, gbm = lgb.train(params,
lgb_train, lgb_train,
num_boost_round=10, num_boost_round=10,
valid_sets=lgb_train, # eval training data valid_sets=lgb_train, # eval training data
feature_name=feature_name, feature_name=feature_name,
categorical_feature=[21]) categorical_feature=[21])
...@@ -88,10 +88,11 @@ gbm = lgb.train(params, ...@@ -88,10 +88,11 @@ gbm = lgb.train(params,
num_boost_round=10, num_boost_round=10,
init_model=gbm, init_model=gbm,
valid_sets=lgb_eval, valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(bagging_fraction=[0.7]*5+[0.6]*5)]) callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])
print('Finish 30 - 40 rounds with changing bagging_fraction...') print('Finish 30 - 40 rounds with changing bagging_fraction...')
# self-defined objective function # self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array # f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss # log likelihood loss
...@@ -102,6 +103,7 @@ def loglikelood(preds, train_data): ...@@ -102,6 +103,7 @@ def loglikelood(preds, train_data):
hess = preds * (1. - preds) hess = preds * (1. - preds)
return grad, hess return grad, hess
# self-defined eval metric # self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool # f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
# binary error # binary error
...@@ -109,6 +111,7 @@ def binary_error(preds, train_data): ...@@ -109,6 +111,7 @@ def binary_error(preds, train_data):
labels = train_data.get_label() labels = train_data.get_label()
return 'error', np.mean(labels != (preds > 0.5)), False return 'error', np.mean(labels != (preds > 0.5)), False
gbm = lgb.train(params, gbm = lgb.train(params,
lgb_train, lgb_train,
num_boost_round=10, num_boost_round=10,
...@@ -120,6 +123,8 @@ gbm = lgb.train(params, ...@@ -120,6 +123,8 @@ gbm = lgb.train(params,
print('Finish 40 - 50 rounds with self-defined objective function and eval metric...') print('Finish 40 - 50 rounds with self-defined objective function and eval metric...')
print('Start a new training job...') print('Start a new training job...')
# callback # callback
def reset_metrics(): def reset_metrics():
def callback(env): def callback(env):
...@@ -131,6 +136,7 @@ def reset_metrics(): ...@@ -131,6 +136,7 @@ def reset_metrics():
callback.order = 0 callback.order = 0
return callback return callback
gbm = lgb.train(params, gbm = lgb.train(params,
lgb_train, lgb_train,
num_boost_round=10, num_boost_round=10,
......
...@@ -21,16 +21,16 @@ lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) ...@@ -21,16 +21,16 @@ lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict # specify your configurations as a dict
params = { params = {
'task' : 'train', 'task': 'train',
'boosting_type' : 'gbdt', 'boosting_type': 'gbdt',
'objective' : 'regression', 'objective': 'regression',
'metric' : {'l2', 'auc'}, 'metric': {'l2', 'auc'},
'num_leaves' : 31, 'num_leaves': 31,
'learning_rate' : 0.05, 'learning_rate': 0.05,
'feature_fraction' : 0.9, 'feature_fraction': 0.9,
'bagging_fraction' : 0.8, 'bagging_fraction': 0.8,
'bagging_freq': 5, 'bagging_freq': 5,
'verbose' : 0 'verbose': 0
} }
print('Start training...') print('Start training...')
......
...@@ -31,3 +31,19 @@ Troubleshooting ...@@ -31,3 +31,19 @@ Troubleshooting
setup.py directory, *never* absolute paths. setup.py directory, *never* absolute paths.
- **Solution 1**: please check `here <http://stackoverflow.com/questions/18085571/pip-install-error-setup-script-specifies-an-absolute-path>`__. - **Solution 1**: please check `here <http://stackoverflow.com/questions/18085571/pip-install-error-setup-script-specifies-an-absolute-path>`__.
Developments
------------
The code style of the Python package follows `pep-8 <https://www.python.org/dev/peps/pep-0008/>`__. If you would like to make a contribution and are not familiar with PEP 8, please read the PEP 8 style guide first; otherwise, your submission will not pass the style check. In particular, be careful about:
- E1 Indentation (check pep-8 link above)
- E201/E202 whitespace after an opening bracket or before a closing bracket
- E225 missing whitespace around operator
- E226 missing whitespace around arithmetic operator
- E261 at least two spaces before inline comment
- E301 expected 1 blank line between methods of a class
- E302 expected 2 blank lines before a top-level function or class definition
You can ignore E501 (line too long).
...@@ -6,8 +6,6 @@ Contributors: https://github.com/Microsoft/LightGBM/graphs/contributors ...@@ -6,8 +6,6 @@ Contributors: https://github.com/Microsoft/LightGBM/graphs/contributors
from __future__ import absolute_import from __future__ import absolute_import
import os
from .basic import Dataset, Booster from .basic import Dataset, Booster
from .engine import train, cv from .engine import train, cv
from .callback import print_evaluation, record_evaluation, reset_parameter, early_stopping from .callback import print_evaluation, record_evaluation, reset_parameter, early_stopping
...@@ -23,4 +21,3 @@ __all__ = ['Dataset', 'Booster', ...@@ -23,4 +21,3 @@ __all__ = ['Dataset', 'Booster',
'train', 'cv', 'train', 'cv',
'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker', 'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker',
'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping'] 'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping']
...@@ -5,11 +5,11 @@ ...@@ -5,11 +5,11 @@
"""Wrapper c_api of LightGBM""" """Wrapper c_api of LightGBM"""
from __future__ import absolute_import from __future__ import absolute_import
import sys
import ctypes import ctypes
import json import json
from tempfile import NamedTemporaryFile
import os import os
import sys
from tempfile import NamedTemporaryFile
import numpy as np import numpy as np
import scipy.sparse import scipy.sparse
...@@ -22,11 +22,13 @@ try: ...@@ -22,11 +22,13 @@ try:
except ImportError: except ImportError:
class Series(object): class Series(object):
pass pass
class DataFrame(object): class DataFrame(object):
pass pass
IS_PY3 = (sys.version_info[0] == 3) IS_PY3 = (sys.version_info[0] == 3)
def _load_lib(): def _load_lib():
"""Load LightGBM Library.""" """Load LightGBM Library."""
lib_path = find_lib_path() lib_path = find_lib_path()
...@@ -36,12 +38,15 @@ def _load_lib(): ...@@ -36,12 +38,15 @@ def _load_lib():
lib.LGBM_GetLastError.restype = ctypes.c_char_p lib.LGBM_GetLastError.restype = ctypes.c_char_p
return lib return lib
_LIB = _load_lib() _LIB = _load_lib()
class LightGBMError(Exception): class LightGBMError(Exception):
"""Error throwed by LightGBM""" """Error throwed by LightGBM"""
pass pass
def _safe_call(ret): def _safe_call(ret):
"""Check the return value of C API call """Check the return value of C API call
Parameters Parameters
...@@ -52,6 +57,7 @@ def _safe_call(ret): ...@@ -52,6 +57,7 @@ def _safe_call(ret):
if ret != 0: if ret != 0:
raise LightGBMError(_LIB.LGBM_GetLastError()) raise LightGBMError(_LIB.LGBM_GetLastError())
def is_str(s): def is_str(s):
"""Check is a str or not""" """Check is a str or not"""
if IS_PY3: if IS_PY3:
...@@ -59,6 +65,7 @@ def is_str(s): ...@@ -59,6 +65,7 @@ def is_str(s):
else: else:
return isinstance(s, basestring) return isinstance(s, basestring)
def is_numeric(obj): def is_numeric(obj):
"""Check is a number or not, include numpy number etc.""" """Check is a number or not, include numpy number etc."""
try: try:
...@@ -67,19 +74,23 @@ def is_numeric(obj): ...@@ -67,19 +74,23 @@ def is_numeric(obj):
except: except:
return False return False
def is_numpy_object(data): def is_numpy_object(data):
"""Check is numpy object""" """Check is numpy object"""
return type(data).__module__ == np.__name__ return type(data).__module__ == np.__name__
def is_numpy_1d_array(data): def is_numpy_1d_array(data):
"""Check is 1d numpy array""" """Check is 1d numpy array"""
return isinstance(data, np.ndarray) and len(data.shape) == 1 return isinstance(data, np.ndarray) and len(data.shape) == 1
def is_1d_list(data): def is_1d_list(data):
"""Check is 1d list""" """Check is 1d list"""
return isinstance(data, list) and \ return isinstance(data, list) and \
(not data or isinstance(data[0], (int, float, bool))) (not data or isinstance(data[0], (int, float, bool)))
def list_to_1d_numpy(data, dtype=np.float32, name='list'): def list_to_1d_numpy(data, dtype=np.float32, name='list'):
"""convert to 1d numpy array""" """convert to 1d numpy array"""
if is_numpy_1d_array(data): if is_numpy_1d_array(data):
...@@ -94,6 +105,7 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'): ...@@ -94,6 +105,7 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'):
else: else:
raise TypeError("Wrong type({}) for {}, should be list or numpy array".format(type(data).__name__, name)) raise TypeError("Wrong type({}) for {}, should be list or numpy array".format(type(data).__name__, name))
def cfloat32_array_to_numpy(cptr, length): def cfloat32_array_to_numpy(cptr, length):
"""Convert a ctypes float pointer array to a numpy array. """Convert a ctypes float pointer array to a numpy array.
""" """
...@@ -102,6 +114,7 @@ def cfloat32_array_to_numpy(cptr, length): ...@@ -102,6 +114,7 @@ def cfloat32_array_to_numpy(cptr, length):
else: else:
raise RuntimeError('Expected float pointer') raise RuntimeError('Expected float pointer')
def cint32_array_to_numpy(cptr, length): def cint32_array_to_numpy(cptr, length):
"""Convert a ctypes float pointer array to a numpy array. """Convert a ctypes float pointer array to a numpy array.
""" """
...@@ -110,44 +123,52 @@ def cint32_array_to_numpy(cptr, length): ...@@ -110,44 +123,52 @@ def cint32_array_to_numpy(cptr, length):
else: else:
raise RuntimeError('Expected int pointer') raise RuntimeError('Expected int pointer')
def c_str(string): def c_str(string):
"""Convert a python string to cstring.""" """Convert a python string to cstring."""
return ctypes.c_char_p(string.encode('utf-8')) return ctypes.c_char_p(string.encode('utf-8'))
def c_array(ctype, values): def c_array(ctype, values):
"""Convert a python array to c array.""" """Convert a python array to c array."""
return (ctype * len(values))(*values) return (ctype * len(values))(*values)
def param_dict_to_str(data): def param_dict_to_str(data):
if data is None or not data: if data is None or not data:
return "" return ""
pairs = [] pairs = []
for key, val in data.items(): for key, val in data.items():
if isinstance(val, (list, tuple, set)) or is_numpy_1d_array(val): if isinstance(val, (list, tuple, set)) or is_numpy_1d_array(val):
pairs.append(str(key)+'='+','.join(map(str, val))) pairs.append(str(key) + '=' + ','.join(map(str, val)))
elif is_str(val) or isinstance(val, (int, float, bool)) or is_numeric(val): elif is_str(val) or isinstance(val, (int, float, bool)) or is_numeric(val):
pairs.append(str(key)+'='+str(val)) pairs.append(str(key) + '=' + str(val))
else: else:
raise TypeError('Unknown type of parameter:%s, got:%s' raise TypeError('Unknown type of parameter:%s, got:%s'
% (key, type(val).__name__)) % (key, type(val).__name__))
return ' '.join(pairs) return ' '.join(pairs)
class _temp_file(object): class _temp_file(object):
def __enter__(self): def __enter__(self):
with NamedTemporaryFile(prefix="lightgbm_tmp_", delete=True) as f: with NamedTemporaryFile(prefix="lightgbm_tmp_", delete=True) as f:
self.name = f.name self.name = f.name
return self return self
def __exit__(self, exc_type, exc_val, exc_tb): def __exit__(self, exc_type, exc_val, exc_tb):
if os.path.isfile(self.name): if os.path.isfile(self.name):
os.remove(self.name) os.remove(self.name)
def readlines(self): def readlines(self):
with open(self.name, "r+") as f: with open(self.name, "r+") as f:
ret = f.readlines() ret = f.readlines()
return ret return ret
def writelines(self, lines): def writelines(self, lines):
with open(self.name, "w+") as f: with open(self.name, "w+") as f:
f.writelines(lines) f.writelines(lines)
"""marco definition of data type in c_api of LightGBM""" """marco definition of data type in c_api of LightGBM"""
C_API_DTYPE_FLOAT32 = 0 C_API_DTYPE_FLOAT32 = 0
C_API_DTYPE_FLOAT64 = 1 C_API_DTYPE_FLOAT64 = 1
...@@ -168,6 +189,7 @@ FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32, ...@@ -168,6 +189,7 @@ FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32,
"init_score": C_API_DTYPE_FLOAT32, "init_score": C_API_DTYPE_FLOAT32,
"group": C_API_DTYPE_INT32} "group": C_API_DTYPE_INT32}
def c_float_array(data): def c_float_array(data):
"""get pointer of float numpy array / list""" """get pointer of float numpy array / list"""
if is_1d_list(data): if is_1d_list(data):
...@@ -186,6 +208,7 @@ def c_float_array(data): ...@@ -186,6 +208,7 @@ def c_float_array(data):
raise TypeError("Unknown type({})".format(type(data).__name__)) raise TypeError("Unknown type({})".format(type(data).__name__))
return (ptr_data, type_data) return (ptr_data, type_data)
def c_int_array(data): def c_int_array(data):
"""get pointer of int numpy array / list""" """get pointer of int numpy array / list"""
if is_1d_list(data): if is_1d_list(data):
...@@ -204,6 +227,7 @@ def c_int_array(data): ...@@ -204,6 +227,7 @@ def c_int_array(data):
raise TypeError("Unknown type({})".format(type(data).__name__)) raise TypeError("Unknown type({})".format(type(data).__name__))
return (ptr_data, type_data) return (ptr_data, type_data)
class _InnerPredictor(object): class _InnerPredictor(object):
""" """
A _InnerPredictor of LightGBM. A _InnerPredictor of LightGBM.
...@@ -255,7 +279,6 @@ class _InnerPredictor(object): ...@@ -255,7 +279,6 @@ class _InnerPredictor(object):
if self.__is_manage_handle: if self.__is_manage_handle:
_safe_call(_LIB.LGBM_BoosterFree(self.handle)) _safe_call(_LIB.LGBM_BoosterFree(self.handle))
def predict(self, data, num_iteration=-1, def predict(self, data, num_iteration=-1,
raw_score=False, pred_leaf=False, data_has_header=False, raw_score=False, pred_leaf=False, data_has_header=False,
is_reshape=True): is_reshape=True):
...@@ -374,8 +397,7 @@ class _InnerPredictor(object): ...@@ -374,8 +397,7 @@ class _InnerPredictor(object):
predict_type, predict_type,
num_iteration, num_iteration,
ctypes.byref(out_num_preds), ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)) preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
))
if n_preds != out_num_preds.value: if n_preds != out_num_preds.value:
raise ValueError("Wrong length for predict results") raise ValueError("Wrong length for predict results")
return preds, mat.shape[0] return preds, mat.shape[0]
...@@ -405,8 +427,7 @@ class _InnerPredictor(object): ...@@ -405,8 +427,7 @@ class _InnerPredictor(object):
predict_type, predict_type,
num_iteration, num_iteration,
ctypes.byref(out_num_preds), ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)) preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
))
if n_preds != out_num_preds.value: if n_preds != out_num_preds.value:
raise ValueError("Wrong length for predict results") raise ValueError("Wrong length for predict results")
return preds, nrow return preds, nrow
...@@ -436,17 +457,18 @@ class _InnerPredictor(object): ...@@ -436,17 +457,18 @@ class _InnerPredictor(object):
predict_type, predict_type,
num_iteration, num_iteration,
ctypes.byref(out_num_preds), ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)) preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
))
if n_preds != out_num_preds.value: if n_preds != out_num_preds.value:
raise ValueError("Wrong length for predict results") raise ValueError("Wrong length for predict results")
return preds, nrow return preds, nrow
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int', PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'int64': 'int', 'uint8': 'int', 'uint16': 'int', 'int64': 'int', 'uint8': 'int', 'uint16': 'int',
'uint32': 'int', 'uint64': 'int', 'float16': 'float', 'uint32': 'int', 'uint64': 'int', 'float16': 'float',
'float32': 'float', 'float64': 'float', 'bool': 'int'} 'float32': 'float', 'float64': 'float', 'bool': 'int'}
def _data_from_pandas(data): def _data_from_pandas(data):
if isinstance(data, DataFrame): if isinstance(data, DataFrame):
data_dtypes = data.dtypes data_dtypes = data.dtypes
...@@ -459,6 +481,7 @@ def _data_from_pandas(data): ...@@ -459,6 +481,7 @@ def _data_from_pandas(data):
data = data.values.astype('float') data = data.values.astype('float')
return data return data
def _label_from_pandas(label): def _label_from_pandas(label):
if isinstance(label, DataFrame): if isinstance(label, DataFrame):
if len(label.columns) > 1: if len(label.columns) > 1:
...@@ -469,6 +492,7 @@ def _label_from_pandas(label): ...@@ -469,6 +492,7 @@ def _label_from_pandas(label):
label = label.values.astype('float') label = label.values.astype('float')
return label return label
class _InnerDataset(object): class _InnerDataset(object):
"""_InnerDataset used in LightGBM. """_InnerDataset used in LightGBM.
_InnerDataset is a internal data structure that used by LightGBM. _InnerDataset is a internal data structure that used by LightGBM.
...@@ -536,8 +560,8 @@ class _InnerDataset(object): ...@@ -536,8 +560,8 @@ class _InnerDataset(object):
elif isinstance(name, int): elif isinstance(name, int):
categorical_indices.add(name) categorical_indices.add(name)
else: else:
raise TypeError("Wrong type({}) or unknown name({}) in categorical_feature" \ raise TypeError("Wrong type({}) or unknown name({}) in categorical_feature"
.format(type(name).__name__, name)) .format(type(name).__name__, name))
params['categorical_column'] = sorted(categorical_indices) params['categorical_column'] = sorted(categorical_indices)
...@@ -552,7 +576,7 @@ class _InnerDataset(object): ...@@ -552,7 +576,7 @@ class _InnerDataset(object):
if is_str(data): if is_str(data):
"""check data has header or not""" """check data has header or not"""
if str(params.get("has_header", "")).lower() == "true" \ if str(params.get("has_header", "")).lower() == "true" \
or str(params.get("header", "")).lower() == "true": or str(params.get("header", "")).lower() == "true":
self.data_has_header = True self.data_has_header = True
self.handle = ctypes.c_void_p() self.handle = ctypes.c_void_p()
_safe_call(_LIB.LGBM_DatasetCreateFromFile( _safe_call(_LIB.LGBM_DatasetCreateFromFile(
...@@ -927,6 +951,7 @@ class _InnerDataset(object): ...@@ -927,6 +951,7 @@ class _InnerDataset(object):
ctypes.byref(ret))) ctypes.byref(ret)))
return ret.value return ret.value
class Dataset(object): class Dataset(object):
"""High level Dataset used in LightGBM. """High level Dataset used in LightGBM.
""" """
...@@ -1140,7 +1165,6 @@ class Dataset(object): ...@@ -1140,7 +1165,6 @@ class Dataset(object):
""" """
self._get_inner_dataset().save_binary(filename) self._get_inner_dataset().save_binary(filename)
def set_label(self, label): def set_label(self, label):
""" """
Set label of Dataset Set label of Dataset
...@@ -1273,6 +1297,7 @@ class Dataset(object): ...@@ -1273,6 +1297,7 @@ class Dataset(object):
else: else:
raise LightGBMError("Cannot call num_feature before construct, please call it explicitly") raise LightGBMError("Cannot call num_feature before construct, please call it explicitly")
class Booster(object): class Booster(object):
""""A Booster of LightGBM. """"A Booster of LightGBM.
""" """
...@@ -1397,7 +1422,7 @@ class Booster(object): ...@@ -1397,7 +1422,7 @@ class Booster(object):
Name of validation data Name of validation data
""" """
if not isinstance(data, Dataset): if not isinstance(data, Dataset):
raise TypeError('valid data should be Dataset instance, met {}'.format(type(train_set).__name__)) raise TypeError('valid data should be Dataset instance, met {}'.format(type(data).__name__))
if data._predictor is not self.__init_predictor: if data._predictor is not self.__init_predictor:
raise LightGBMError("Add validation data failed, you should use same predictor for these data") raise LightGBMError("Add validation data failed, you should use same predictor for these data")
_safe_call(_LIB.LGBM_BoosterAddValidData( _safe_call(_LIB.LGBM_BoosterAddValidData(
...@@ -1578,8 +1603,8 @@ class Booster(object): ...@@ -1578,8 +1603,8 @@ class Booster(object):
result: str result: str
Evaluation result list. Evaluation result list.
""" """
return [item for i in range(1, self.__num_dataset) \ return [item for i in range(1, self.__num_dataset)
for item in self.__inner_eval(self.name_valid_sets[i-1], i, feval)] for item in self.__inner_eval(self.name_valid_sets[i - 1], i, feval)]
def save_model(self, filename, num_iteration=-1): def save_model(self, filename, num_iteration=-1):
""" """
...@@ -1684,6 +1709,7 @@ class Booster(object): ...@@ -1684,6 +1709,7 @@ class Booster(object):
raise KeyError("importance_type must be split or gain") raise KeyError("importance_type must be split or gain")
dump_model = self.dump_model() dump_model = self.dump_model()
ret = [0] * (dump_model["max_feature_idx"] + 1) ret = [0] * (dump_model["max_feature_idx"] + 1)
def dfs(root): def dfs(root):
if "split_feature" in root: if "split_feature" in root:
if importance_type == 'split': if importance_type == 'split':
...@@ -1773,7 +1799,7 @@ class Booster(object): ...@@ -1773,7 +1799,7 @@ class Booster(object):
"""Get name of evals""" """Get name of evals"""
tmp_out_len = ctypes.c_int64(0) tmp_out_len = ctypes.c_int64(0)
string_buffers = [ctypes.create_string_buffer(255) for i in range(self.__num_inner_eval)] string_buffers = [ctypes.create_string_buffer(255) for i in range(self.__num_inner_eval)]
ptr_string_buffers = (ctypes.c_char_p*self.__num_inner_eval)(*map(ctypes.addressof, string_buffers)) ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers))
_safe_call(_LIB.LGBM_BoosterGetEvalNames( _safe_call(_LIB.LGBM_BoosterGetEvalNames(
self.handle, self.handle,
ctypes.byref(tmp_out_len), ctypes.byref(tmp_out_len),
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
from __future__ import absolute_import from __future__ import absolute_import
import collections import collections
class EarlyStopException(Exception): class EarlyStopException(Exception):
"""Exception of early stopping. """Exception of early stopping.
Parameters Parameters
...@@ -14,6 +15,7 @@ class EarlyStopException(Exception): ...@@ -14,6 +15,7 @@ class EarlyStopException(Exception):
super(EarlyStopException, self).__init__() super(EarlyStopException, self).__init__()
self.best_iteration = best_iteration self.best_iteration = best_iteration
# Callback environment used by callbacks # Callback environment used by callbacks
CallbackEnv = collections.namedtuple( CallbackEnv = collections.namedtuple(
"LightGBMCallbackEnv", "LightGBMCallbackEnv",
...@@ -24,6 +26,7 @@ CallbackEnv = collections.namedtuple( ...@@ -24,6 +26,7 @@ CallbackEnv = collections.namedtuple(
"end_iteration", "end_iteration",
"evaluation_result_list"]) "evaluation_result_list"])
def _format_eval_result(value, show_stdv=True): def _format_eval_result(value, show_stdv=True):
"""format metric string""" """format metric string"""
if len(value) == 4: if len(value) == 4:
...@@ -58,8 +61,9 @@ def print_evaluation(period=1, show_stdv=True): ...@@ -58,8 +61,9 @@ def print_evaluation(period=1, show_stdv=True):
if not env.evaluation_result_list or period <= 0: if not env.evaluation_result_list or period <= 0:
return return
if (env.iteration + 1) % period == 0: if (env.iteration + 1) % period == 0:
result = '\t'.join([_format_eval_result(x, show_stdv) \ result = '\t'.join(
for x in env.evaluation_result_list]) [_format_eval_result(x, show_stdv) for x in env.evaluation_result_list]
)
print('[%d]\t%s' % (env.iteration + 1, result)) print('[%d]\t%s' % (env.iteration + 1, result))
callback.order = 10 callback.order = 10
return callback return callback
...@@ -152,6 +156,7 @@ def early_stopping(stopping_rounds, verbose=True): ...@@ -152,6 +156,7 @@ def early_stopping(stopping_rounds, verbose=True):
best_score = {} best_score = {}
best_iter = {} best_iter = {}
best_msg = {} best_msg = {}
def init(env): def init(env):
"""internal function""" """internal function"""
if not env.evaluation_result_list: if not env.evaluation_result_list:
...@@ -178,8 +183,11 @@ def early_stopping(stopping_rounds, verbose=True): ...@@ -178,8 +183,11 @@ def early_stopping(stopping_rounds, verbose=True):
best_score[i] = score best_score[i] = score
best_iter[i] = env.iteration best_iter[i] = env.iteration
if verbose: if verbose:
best_msg[i] = '[%d]\t%s' % (env.iteration + 1, \ best_msg[i] = '[%d]\t%s' % (
'\t'.join([_format_eval_result(x) for x in env.evaluation_result_list])) env.iteration + 1, '\t'.join(
[_format_eval_result(x) for x in env.evaluation_result_list]
)
)
else: else:
if env.iteration - best_iter[i] >= stopping_rounds: if env.iteration - best_iter[i] >= stopping_rounds:
if env.model is not None: if env.model is not None:
......
...@@ -9,6 +9,7 @@ import numpy as np ...@@ -9,6 +9,7 @@ import numpy as np
from .basic import LightGBMError, _InnerPredictor, Dataset, Booster, is_str from .basic import LightGBMError, _InnerPredictor, Dataset, Booster, is_str
from . import callback from . import callback
def train(params, train_set, num_boost_round=100, def train(params, train_set, num_boost_round=100,
valid_sets=None, valid_names=None, valid_sets=None, valid_names=None,
fobj=None, feval=None, init_model=None, fobj=None, feval=None, init_model=None,
...@@ -121,7 +122,7 @@ def train(params, train_set, num_boost_round=100, ...@@ -121,7 +122,7 @@ def train(params, train_set, num_boost_round=100,
if valid_names is not None and len(valid_names) > i: if valid_names is not None and len(valid_names) > i:
name_valid_sets.append(valid_names[i]) name_valid_sets.append(valid_names[i])
else: else:
name_valid_sets.append('valid_'+str(i)) name_valid_sets.append('valid_' + str(i))
for valid_data in valid_sets: for valid_data in valid_sets:
valid_data._update_params(params) valid_data._update_params(params)
"""process callbacks""" """process callbacks"""
...@@ -211,6 +212,7 @@ class CVBooster(object): ...@@ -211,6 +212,7 @@ class CVBooster(object):
""""Evaluate the CVBooster for one iteration.""" """"Evaluate the CVBooster for one iteration."""
return self.booster.eval_valid(feval) return self.booster.eval_valid(feval)
try: try:
from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import StratifiedKFold
SKLEARN_StratifiedKFold = True SKLEARN_StratifiedKFold = True
...@@ -221,6 +223,7 @@ except ImportError: ...@@ -221,6 +223,7 @@ except ImportError:
except ImportError: except ImportError:
SKLEARN_StratifiedKFold = False SKLEARN_StratifiedKFold = False
def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=False, shuffle=True): def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=False, shuffle=True):
""" """
Make an n-fold list of CVBooster from random indices. Make an n-fold list of CVBooster from random indices.
...@@ -251,6 +254,7 @@ def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=Fals ...@@ -251,6 +254,7 @@ def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=Fals
ret.append(CVBooster(train_set, valid_set, tparam)) ret.append(CVBooster(train_set, valid_set, tparam))
return ret return ret
def _agg_cv_result(raw_results): def _agg_cv_result(raw_results):
""" """
Aggregate cross-validation results. Aggregate cross-validation results.
...@@ -263,6 +267,7 @@ def _agg_cv_result(raw_results): ...@@ -263,6 +267,7 @@ def _agg_cv_result(raw_results):
cvmap[one_line[1]].append(one_line[2]) cvmap[one_line[1]].append(one_line[2])
return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()] return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()]
def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False, def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
shuffle=True, metrics=None, fobj=None, feval=None, init_model=None, shuffle=True, metrics=None, fobj=None, feval=None, init_model=None,
feature_name=None, categorical_feature=None, feature_name=None, categorical_feature=None,
......
...@@ -26,5 +26,5 @@ def find_lib_path(): ...@@ -26,5 +26,5 @@ def find_lib_path():
lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
if not lib_path: if not lib_path:
dll_path = [os.path.realpath(p) for p in dll_path] dll_path = [os.path.realpath(p) for p in dll_path]
raise Exception('Cannot find lightgbm Library in following paths: '+','.join(dll_path)) raise Exception('Cannot find lightgbm Library in following paths: ' + ','.join(dll_path))
return lib_path return lib_path
...@@ -25,6 +25,7 @@ except ImportError: ...@@ -25,6 +25,7 @@ except ImportError:
LGBMRegressorBase = object LGBMRegressorBase = object
LGBMLabelEncoder = None LGBMLabelEncoder = None
def _objective_function_wrapper(func): def _objective_function_wrapper(func):
"""Decorate an objective function """Decorate an objective function
Note: for multi-class task, the y_pred is group by class_id first, then group by row_id Note: for multi-class task, the y_pred is group by class_id first, then group by row_id
...@@ -62,7 +63,7 @@ def _objective_function_wrapper(func): ...@@ -62,7 +63,7 @@ def _objective_function_wrapper(func):
elif argc == 3: elif argc == 3:
grad, hess = func(labels, preds, dataset.get_group()) grad, hess = func(labels, preds, dataset.get_group())
else: else:
raise TypeError("Self-defined objective function should have 2 or 3 arguments, got %d" %(argc)) raise TypeError("Self-defined objective function should have 2 or 3 arguments, got %d" % argc)
"""weighted for objective""" """weighted for objective"""
weight = dataset.get_weight() weight = dataset.get_weight()
if weight is not None: if weight is not None:
...@@ -83,6 +84,7 @@ def _objective_function_wrapper(func): ...@@ -83,6 +84,7 @@ def _objective_function_wrapper(func):
return grad, hess return grad, hess
return inner return inner
def _eval_function_wrapper(func): def _eval_function_wrapper(func):
"""Decorate an eval function """Decorate an eval function
Note: for multi-class task, the y_pred is group by class_id first, then group by row_id Note: for multi-class task, the y_pred is group by class_id first, then group by row_id
...@@ -128,9 +130,10 @@ def _eval_function_wrapper(func): ...@@ -128,9 +130,10 @@ def _eval_function_wrapper(func):
elif argc == 4: elif argc == 4:
return func(labels, preds, dataset.get_weight(), dataset.get_group()) return func(labels, preds, dataset.get_weight(), dataset.get_group())
else: else:
raise TypeError("Self-defined eval function should have 2, 3 or 4 arguments, got %d" %(argc)) raise TypeError("Self-defined eval function should have 2, 3 or 4 arguments, got %d" % argc)
return inner return inner
class LGBMModel(LGBMModelBase): class LGBMModel(LGBMModelBase):
def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1, def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
...@@ -354,9 +357,9 @@ class LGBMModel(LGBMModelBase): ...@@ -354,9 +357,9 @@ class LGBMModel(LGBMModelBase):
if hasattr(self, 'eval_at'): if hasattr(self, 'eval_at'):
params['ndcg_eval_at'] = self.eval_at params['ndcg_eval_at'] = self.eval_at
if self.fobj: if self.fobj:
params['objective'] = 'None' # objective = nullptr for unknown objective params['objective'] = 'None' # objective = nullptr for unknown objective
if 'label_gain' in params and params['label_gain'] is None: if 'label_gain' in params and params['label_gain'] is None:
del params['label_gain'] # use default of cli version del params['label_gain'] # use default of cli version
if callable(eval_metric): if callable(eval_metric):
feval = _eval_function_wrapper(eval_metric) feval = _eval_function_wrapper(eval_metric)
...@@ -474,6 +477,7 @@ class LGBMModel(LGBMModelBase): ...@@ -474,6 +477,7 @@ class LGBMModel(LGBMModelBase):
def feature_importance(self): def feature_importance(self):
return self.feature_importance_ return self.feature_importance_
class LGBMRegressor(LGBMModel, LGBMRegressorBase): class LGBMRegressor(LGBMModel, LGBMRegressorBase):
def fit(self, X, y, def fit(self, X, y,
...@@ -495,6 +499,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase): ...@@ -495,6 +499,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
callbacks=callbacks) callbacks=callbacks)
return self return self
class LGBMClassifier(LGBMModel, LGBMClassifierBase): class LGBMClassifier(LGBMModel, LGBMClassifierBase):
def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1, def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
...@@ -595,6 +600,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase): ...@@ -595,6 +600,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
raise LightGBMError('No classes found. Need to call fit beforehand.') raise LightGBMError('No classes found. Need to call fit beforehand.')
return self.n_classes return self.n_classes
class LGBMRanker(LGBMModel): class LGBMRanker(LGBMModel):
def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1, def __init__(self, boosting_type="gbdt", num_leaves=31, max_depth=-1,
...@@ -646,7 +652,7 @@ class LGBMRanker(LGBMModel): ...@@ -646,7 +652,7 @@ class LGBMRanker(LGBMModel):
elif len(eval_group) != len(eval_set): elif len(eval_group) != len(eval_set):
raise ValueError("Length of eval_group should equal to eval_set") raise ValueError("Length of eval_group should equal to eval_set")
elif (isinstance(eval_group, dict) and any(i not in eval_group or eval_group[i] is None for i in range(len(eval_group)))) \ elif (isinstance(eval_group, dict) and any(i not in eval_group or eval_group[i] is None for i in range(len(eval_group)))) \
or (isinstance(eval_group, list) and any(group is None for group in eval_group)): or (isinstance(eval_group, list) and any(group is None for group in eval_group)):
raise ValueError("Should set group for all eval dataset for ranking task; if you use dict, the index should start from 0") raise ValueError("Should set group for all eval dataset for ranking task; if you use dict, the index should start from 0")
if eval_at is not None: if eval_at is not None:
......
import sys # coding: utf-8
import os # pylint: skip-file
import ctypes import ctypes
import collections import os
import numpy as np import numpy as np
from scipy import sparse from scipy import sparse
def LoadDll(): def LoadDll():
if os.name == 'nt': if os.name == 'nt':
lib_path = '../../windows/x64/DLL/lib_lightgbm.dll' lib_path = '../../windows/x64/DLL/lib_lightgbm.dll'
...@@ -14,6 +15,7 @@ def LoadDll(): ...@@ -14,6 +15,7 @@ def LoadDll():
lib = ctypes.cdll.LoadLibrary(lib_path) lib = ctypes.cdll.LoadLibrary(lib_path)
return lib return lib
LIB = LoadDll() LIB = LoadDll()
LIB.LGBM_GetLastError.restype = ctypes.c_char_p LIB.LGBM_GetLastError.restype = ctypes.c_char_p
...@@ -27,25 +29,29 @@ dtype_int64 = 3 ...@@ -27,25 +29,29 @@ dtype_int64 = 3
def c_array(ctype, values): def c_array(ctype, values):
return (ctype * len(values))(*values) return (ctype * len(values))(*values)
def c_str(string): def c_str(string):
return ctypes.c_char_p(string.encode('ascii')) return ctypes.c_char_p(string.encode('ascii'))
def test_load_from_file(filename, reference): def test_load_from_file(filename, reference):
ref = None ref = None
if reference != None: if reference is not None:
ref = reference ref = reference
handle = ctypes.c_void_p() handle = ctypes.c_void_p()
LIB.LGBM_DatasetCreateFromFile(c_str(filename), LIB.LGBM_DatasetCreateFromFile(
c_str('max_bin=15'), c_str(filename),
ref, ctypes.byref(handle) ) c_str('max_bin=15'),
ref, ctypes.byref(handle))
print(LIB.LGBM_GetLastError()) print(LIB.LGBM_GetLastError())
num_data = ctypes.c_long() num_data = ctypes.c_long()
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data) ) LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_long() num_feature = ctypes.c_long()
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature) ) LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
print ('#data:%d #feature:%d' %(num_data.value, num_feature.value) ) print('#data:%d #feature:%d' % (num_data.value, num_feature.value))
return handle return handle
def test_save_to_binary(handle, filename): def test_save_to_binary(handle, filename):
LIB.LGBM_DatasetSaveBinary(handle, c_str(filename)) LIB.LGBM_DatasetSaveBinary(handle, c_str(filename))
...@@ -55,105 +61,113 @@ def test_load_from_csr(filename, reference): ...@@ -55,105 +61,113 @@ def test_load_from_csr(filename, reference):
label = [] label = []
inp = open(filename, 'r') inp = open(filename, 'r')
for line in inp.readlines(): for line in inp.readlines():
data.append( [float(x) for x in line.split('\t')[1:]] ) data.append([float(x) for x in line.split('\t')[1:]])
label.append( float(line.split('\t')[0]) ) label.append(float(line.split('\t')[0]))
inp.close() inp.close()
mat = np.array(data) mat = np.array(data)
label = np.array(label, dtype=np.float32) label = np.array(label, dtype=np.float32)
csr = sparse.csr_matrix(mat) csr = sparse.csr_matrix(mat)
handle = ctypes.c_void_p() handle = ctypes.c_void_p()
ref = None ref = None
if reference != None: if reference is not None:
ref = reference ref = reference
LIB.LGBM_DatasetCreateFromCSR(c_array(ctypes.c_int, csr.indptr), LIB.LGBM_DatasetCreateFromCSR(
dtype_int32, c_array(ctypes.c_int, csr.indptr),
c_array(ctypes.c_int, csr.indices), dtype_int32,
c_array(ctypes.c_int, csr.indices),
csr.data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)), csr.data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)),
dtype_float64, dtype_float64,
len(csr.indptr), len(csr.indptr),
len(csr.data), len(csr.data),
csr.shape[1], csr.shape[1],
c_str('max_bin=15'), c_str('max_bin=15'),
ref, ref,
ctypes.byref(handle) ) ctypes.byref(handle))
num_data = ctypes.c_long() num_data = ctypes.c_long()
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data) ) LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_long() num_feature = ctypes.c_long()
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature) ) LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField(handle, c_str('label'), c_array(ctypes.c_float, label), len(label), 0) LIB.LGBM_DatasetSetField(handle, c_str('label'), c_array(ctypes.c_float, label), len(label), 0)
print ('#data:%d #feature:%d' %(num_data.value, num_feature.value) ) print('#data:%d #feature:%d' % (num_data.value, num_feature.value))
return handle return handle
def test_load_from_csc(filename, reference): def test_load_from_csc(filename, reference):
data = [] data = []
label = [] label = []
inp = open(filename, 'r') inp = open(filename, 'r')
for line in inp.readlines(): for line in inp.readlines():
data.append( [float(x) for x in line.split('\t')[1:]] ) data.append([float(x) for x in line.split('\t')[1:]])
label.append( float(line.split('\t')[0]) ) label.append(float(line.split('\t')[0]))
inp.close() inp.close()
mat = np.array(data) mat = np.array(data)
label = np.array(label, dtype=np.float32) label = np.array(label, dtype=np.float32)
csr = sparse.csc_matrix(mat) csr = sparse.csc_matrix(mat)
handle = ctypes.c_void_p() handle = ctypes.c_void_p()
ref = None ref = None
if reference != None: if reference is not None:
ref = reference ref = reference
LIB.LGBM_DatasetCreateFromCSC(c_array(ctypes.c_int, csr.indptr), LIB.LGBM_DatasetCreateFromCSC(
dtype_int32, c_array(ctypes.c_int, csr.indptr),
c_array(ctypes.c_int, csr.indices), dtype_int32,
c_array(ctypes.c_int, csr.indices),
csr.data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)), csr.data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)),
dtype_float64, dtype_float64,
len(csr.indptr), len(csr.indptr),
len(csr.data), len(csr.data),
csr.shape[0], csr.shape[0],
c_str('max_bin=15'), c_str('max_bin=15'),
ref, ref,
ctypes.byref(handle) ) ctypes.byref(handle))
num_data = ctypes.c_long() num_data = ctypes.c_long()
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data) ) LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_long() num_feature = ctypes.c_long()
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature) ) LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField(handle, c_str('label'), c_array(ctypes.c_float, label), len(label), 0) LIB.LGBM_DatasetSetField(handle, c_str('label'), c_array(ctypes.c_float, label), len(label), 0)
print ('#data:%d #feature:%d' %(num_data.value, num_feature.value) ) print('#data:%d #feature:%d' % (num_data.value, num_feature.value))
return handle return handle
def test_load_from_mat(filename, reference): def test_load_from_mat(filename, reference):
data = [] data = []
label = [] label = []
inp = open(filename, 'r') inp = open(filename, 'r')
for line in inp.readlines(): for line in inp.readlines():
data.append( [float(x) for x in line.split('\t')[1:]] ) data.append([float(x) for x in line.split('\t')[1:]])
label.append( float(line.split('\t')[0]) ) label.append(float(line.split('\t')[0]))
inp.close() inp.close()
mat = np.array(data) mat = np.array(data)
data = np.array(mat.reshape(mat.size), copy=False) data = np.array(mat.reshape(mat.size), copy=False)
label = np.array(label, dtype=np.float32) label = np.array(label, dtype=np.float32)
handle = ctypes.c_void_p() handle = ctypes.c_void_p()
ref = None ref = None
if reference != None: if reference is not None:
ref = reference ref = reference
LIB.LGBM_DatasetCreateFromMat(data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)), LIB.LGBM_DatasetCreateFromMat(data.ctypes.data_as(
ctypes.POINTER(ctypes.c_void_p)),
dtype_float64, dtype_float64,
mat.shape[0], mat.shape[0],
mat.shape[1], mat.shape[1],
1, 1,
c_str('max_bin=15'), c_str('max_bin=15'),
ref, ref,
ctypes.byref(handle) ) ctypes.byref(handle))
num_data = ctypes.c_long() num_data = ctypes.c_long()
LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data) ) LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
num_feature = ctypes.c_long() num_feature = ctypes.c_long()
LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature) ) LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
LIB.LGBM_DatasetSetField(handle, c_str('label'), c_array(ctypes.c_float, label), len(label), 0) LIB.LGBM_DatasetSetField(handle, c_str('label'), c_array(ctypes.c_float, label), len(label), 0)
print ('#data:%d #feature:%d' %(num_data.value, num_feature.value) ) print('#data:%d #feature:%d' % (num_data.value, num_feature.value))
return handle return handle
def test_free_dataset(handle): def test_free_dataset(handle):
LIB.LGBM_DatasetFree(handle) LIB.LGBM_DatasetFree(handle)
def test_dataset(): def test_dataset():
train = test_load_from_file('../../examples/binary_classification/binary.train', None) train = test_load_from_file('../../examples/binary_classification/binary.train', None)
test = test_load_from_mat('../../examples/binary_classification/binary.test', train) test = test_load_from_mat('../../examples/binary_classification/binary.test', train)
...@@ -164,8 +178,10 @@ def test_dataset(): ...@@ -164,8 +178,10 @@ def test_dataset():
test_free_dataset(test) test_free_dataset(test)
test_save_to_binary(train, 'train.binary.bin') test_save_to_binary(train, 'train.binary.bin')
test_free_dataset(train) test_free_dataset(train)
train = test_load_from_file('train.binary.bin', None) train = test_load_from_file('train.binary.bin', None)
test_free_dataset(train) test_free_dataset(train)
def test_booster(): def test_booster():
train = test_load_from_mat('../../examples/binary_classification/binary.train', None) train = test_load_from_mat('../../examples/binary_classification/binary.train', None)
test = test_load_from_mat('../../examples/binary_classification/binary.test', train) test = test_load_from_mat('../../examples/binary_classification/binary.test', train)
...@@ -174,11 +190,11 @@ def test_booster(): ...@@ -174,11 +190,11 @@ def test_booster():
LIB.LGBM_BoosterAddValidData(booster, test) LIB.LGBM_BoosterAddValidData(booster, test)
is_finished = ctypes.c_int(0) is_finished = ctypes.c_int(0)
for i in range(100): for i in range(100):
LIB.LGBM_BoosterUpdateOneIter(booster,ctypes.byref(is_finished)) LIB.LGBM_BoosterUpdateOneIter(booster, ctypes.byref(is_finished))
result = np.array([0.0], dtype=np.float64) result = np.array([0.0], dtype=np.float64)
out_len = ctypes.c_ulong(0) out_len = ctypes.c_ulong(0)
LIB.LGBM_BoosterGetEval(booster, 0, ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))) LIB.LGBM_BoosterGetEval(booster, 0, ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
print ('%d Iteration test AUC %f' %(i, result[0])) print('%d Iteration test AUC %f' % (i, result[0]))
LIB.LGBM_BoosterSaveModel(booster, -1, c_str('model.txt')) LIB.LGBM_BoosterSaveModel(booster, -1, c_str('model.txt'))
LIB.LGBM_BoosterFree(booster) LIB.LGBM_BoosterFree(booster)
test_free_dataset(train) test_free_dataset(train)
...@@ -189,14 +205,15 @@ def test_booster(): ...@@ -189,14 +205,15 @@ def test_booster():
data = [] data = []
inp = open('../../examples/binary_classification/binary.test', 'r') inp = open('../../examples/binary_classification/binary.test', 'r')
for line in inp.readlines(): for line in inp.readlines():
data.append( [float(x) for x in line.split('\t')[1:]] ) data.append([float(x) for x in line.split('\t')[1:]])
inp.close() inp.close()
mat = np.array(data) mat = np.array(data)
preb = np.zeros(mat.shape[0], dtype=np.float64) preb = np.zeros(mat.shape[0], dtype=np.float64)
num_preb = ctypes.c_long() num_preb = ctypes.c_long()
data = np.array(mat.reshape(mat.size), copy=False) data = np.array(mat.reshape(mat.size), copy=False)
LIB.LGBM_BoosterPredictForMat(booster2, LIB.LGBM_BoosterPredictForMat(
data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)), booster2,
data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)),
dtype_float64, dtype_float64,
mat.shape[0], mat.shape[0],
mat.shape[1], mat.shape[1],
...@@ -205,8 +222,9 @@ def test_booster(): ...@@ -205,8 +222,9 @@ def test_booster():
50, 50,
ctypes.byref(num_preb), ctypes.byref(num_preb),
preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double))) preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
LIB.LGBM_BoosterPredictForFile(booster2,c_str('../../examples/binary_classification/binary.test'),0 , 0, 50, c_str('preb.txt')) LIB.LGBM_BoosterPredictForFile(booster2, c_str('../../examples/binary_classification/binary.test'), 0, 0, 50, c_str('preb.txt'))
LIB.LGBM_BoosterFree(booster2) LIB.LGBM_BoosterFree(booster2)
test_dataset() test_dataset()
test_booster() test_booster()
# coding: utf-8 # coding: utf-8
# pylint: skip-file # pylint: skip-file
import unittest, tempfile, os import os
import tempfile
import unittest
import lightgbm as lgb
import numpy as np import numpy as np
from sklearn.datasets import load_breast_cancer from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
import lightgbm as lgb
class TestBasic(unittest.TestCase): class TestBasic(unittest.TestCase):
...@@ -14,11 +18,11 @@ class TestBasic(unittest.TestCase): ...@@ -14,11 +18,11 @@ class TestBasic(unittest.TestCase):
valid_data = train_data.create_valid(X_test, label=y_test) valid_data = train_data.create_valid(X_test, label=y_test)
params = { params = {
"objective" : "binary", "objective": "binary",
"metric" : "auc", "metric": "auc",
"min_data" : 1, "min_data": 1,
"num_leaves" : 15, "num_leaves": 15,
"verbose" : -1 "verbose": -1
} }
bst = lgb.Booster(params, train_data) bst = lgb.Booster(params, train_data)
bst.add_valid(valid_data, "valid_1") bst.add_valid(valid_data, "valid_1")
...@@ -38,13 +42,14 @@ class TestBasic(unittest.TestCase): ...@@ -38,13 +42,14 @@ class TestBasic(unittest.TestCase):
self.assertEqual(len(pred_from_matr), len(pred_from_file)) self.assertEqual(len(pred_from_matr), len(pred_from_file))
for preds in zip(pred_from_matr, pred_from_file): for preds in zip(pred_from_matr, pred_from_file):
self.assertAlmostEqual(*preds, places=15) self.assertAlmostEqual(*preds, places=15)
#check saved model persistence # check saved model persistence
bst = lgb.Booster(params, model_file="model.txt") bst = lgb.Booster(params, model_file="model.txt")
pred_from_model_file = bst.predict(X_test) pred_from_model_file = bst.predict(X_test)
self.assertEqual(len(pred_from_matr), len(pred_from_model_file)) self.assertEqual(len(pred_from_matr), len(pred_from_model_file))
for preds in zip(pred_from_matr, pred_from_model_file): for preds in zip(pred_from_matr, pred_from_model_file):
self.assertAlmostEqual(*preds, places=15) self.assertAlmostEqual(*preds, places=15)
print("----------------------------------------------------------------------") print("----------------------------------------------------------------------")
print("running test_basic.py") print("running test_basic.py")
unittest.main() unittest.main()
# coding: utf-8 # coding: utf-8
# pylint: skip-file # pylint: skip-file
import os, unittest, math, copy import copy
import numpy as np import math
import os
import unittest
import lightgbm as lgb import lightgbm as lgb
from sklearn.metrics import log_loss, mean_squared_error, mean_absolute_error import numpy as np
from sklearn.datasets import load_breast_cancer, load_boston, load_digits, load_iris from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
load_iris)
from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
try: try:
import cPickle as pickle import cPickle as pickle
except: except:
import pickle import pickle
def multi_logloss(y_true, y_pred): def multi_logloss(y_true, y_pred):
return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)]) return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)])
def test_template(params = {'objective' : 'regression', 'metric' : 'l2'},
def test_template(params={'objective': 'regression', 'metric': 'l2'},
X_y=load_boston(True), feval=mean_squared_error, X_y=load_boston(True), feval=mean_squared_error,
num_round=100, init_model=None, custom_eval=None, num_round=100, init_model=None, custom_eval=None,
early_stopping_rounds=10, early_stopping_rounds=10,
...@@ -23,7 +31,8 @@ def test_template(params = {'objective' : 'regression', 'metric' : 'l2'}, ...@@ -23,7 +31,8 @@ def test_template(params = {'objective' : 'regression', 'metric' : 'l2'},
X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
lgb_train = lgb.Dataset(X_train, y_train, params=params) lgb_train = lgb.Dataset(X_train, y_train, params=params)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
if return_data: return lgb_train, lgb_eval if return_data:
return lgb_train, lgb_eval
evals_result = {} evals_result = {}
gbm = lgb.train(params, lgb_train, gbm = lgb.train(params, lgb_train,
num_boost_round=num_round, num_boost_round=num_round,
...@@ -34,16 +43,19 @@ def test_template(params = {'objective' : 'regression', 'metric' : 'l2'}, ...@@ -34,16 +43,19 @@ def test_template(params = {'objective' : 'regression', 'metric' : 'l2'},
evals_result=evals_result, evals_result=evals_result,
early_stopping_rounds=early_stopping_rounds, early_stopping_rounds=early_stopping_rounds,
init_model=init_model) init_model=init_model)
if return_model: return gbm if return_model:
else: return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration)) return gbm
else:
return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration))
class TestEngine(unittest.TestCase): class TestEngine(unittest.TestCase):
def test_binary(self): def test_binary(self):
X_y= load_breast_cancer(True) X_y = load_breast_cancer(True)
params = { params = {
'objective' : 'binary', 'objective': 'binary',
'metric' : 'binary_logloss' 'metric': 'binary_logloss'
} }
evals_result, ret = test_template(params, X_y, log_loss) evals_result, ret = test_template(params, X_y, log_loss)
self.assertLess(ret, 0.15) self.assertLess(ret, 0.15)
...@@ -58,9 +70,9 @@ class TestEngine(unittest.TestCase): ...@@ -58,9 +70,9 @@ class TestEngine(unittest.TestCase):
def test_multiclass(self): def test_multiclass(self):
X_y = load_digits(10, True) X_y = load_digits(10, True)
params = { params = {
'objective' : 'multiclass', 'objective': 'multiclass',
'metric' : 'multi_logloss', 'metric': 'multi_logloss',
'num_class' : 10 'num_class': 10
} }
evals_result, ret = test_template(params, X_y, multi_logloss) evals_result, ret = test_template(params, X_y, multi_logloss)
self.assertLess(ret, 0.2) self.assertLess(ret, 0.2)
...@@ -68,8 +80,8 @@ class TestEngine(unittest.TestCase): ...@@ -68,8 +80,8 @@ class TestEngine(unittest.TestCase):
def test_continue_train_and_other(self): def test_continue_train_and_other(self):
params = { params = {
'objective' : 'regression', 'objective': 'regression',
'metric' : 'l1' 'metric': 'l1'
} }
model_name = 'model.txt' model_name = 'model.txt'
gbm = test_template(params, num_round=20, return_model=True, early_stopping_rounds=-1) gbm = test_template(params, num_round=20, return_model=True, early_stopping_rounds=-1)
...@@ -88,19 +100,19 @@ class TestEngine(unittest.TestCase): ...@@ -88,19 +100,19 @@ class TestEngine(unittest.TestCase):
def test_continue_train_multiclass(self): def test_continue_train_multiclass(self):
X_y = load_iris(True) X_y = load_iris(True)
params = { params = {
'objective' : 'multiclass', 'objective': 'multiclass',
'metric' : 'multi_logloss', 'metric': 'multi_logloss',
'num_class' : 3 'num_class': 3
} }
gbm = test_template(params, X_y, num_round=20, return_model=True, early_stopping_rounds=-1) gbm = test_template(params, X_y, num_round=20, return_model=True, early_stopping_rounds=-1)
evals_result, ret = test_template(params, X_y, feval=multi_logloss, evals_result, ret = test_template(params, X_y, feval=multi_logloss,
num_round=80, init_model=gbm) num_round=80, init_model=gbm)
self.assertLess(ret, 1.5) self.assertLess(ret, 1.5)
self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5) self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5)
def test_cv(self): def test_cv(self):
lgb_train, _ = test_template(return_data=True) lgb_train, _ = test_template(return_data=True)
lgb.cv({'verbose':0}, lgb_train, num_boost_round=20, nfold=5, lgb.cv({'verbose': 0}, lgb_train, num_boost_round=20, nfold=5,
metrics='l1', verbose_eval=False) metrics='l1', verbose_eval=False)
def test_save_load_copy_pickle(self): def test_save_load_copy_pickle(self):
...@@ -123,6 +135,7 @@ class TestEngine(unittest.TestCase): ...@@ -123,6 +135,7 @@ class TestEngine(unittest.TestCase):
for ret in other_ret: for ret in other_ret:
self.assertAlmostEqual(ret_origin, ret, places=5) self.assertAlmostEqual(ret_origin, ret, places=5)
print("----------------------------------------------------------------------") print("----------------------------------------------------------------------")
print("running test_engine.py") print("running test_engine.py")
unittest.main() unittest.main()
# coding: utf-8 # coding: utf-8
# pylint: skip-file # pylint: skip-file
import os, unittest import unittest
import numpy as np
import lightgbm as lgb import lightgbm as lgb
from sklearn.metrics import log_loss, mean_squared_error import numpy as np
from sklearn.datasets import load_breast_cancer, load_boston, load_digits, load_svmlight_file
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import clone from sklearn.base import clone
from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
load_svmlight_file)
from sklearn.externals import joblib from sklearn.externals import joblib
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
def test_template(X_y=load_boston(True), model=lgb.LGBMRegressor, def test_template(X_y=load_boston(True), model=lgb.LGBMRegressor,
feval=mean_squared_error, num_round=100, feval=mean_squared_error, num_round=100,
custom_obj=None, predict_proba=False, custom_obj=None, predict_proba=False,
return_data=False, return_model=False): return_data=False, return_model=False):
X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
if return_data: return X_train, X_test, y_train, y_test if return_data:
arguments = {'n_estimators' : num_round, 'silent' : True} return X_train, X_test, y_train, y_test
if custom_obj: arguments['objective'] = custom_obj arguments = {'n_estimators': num_round, 'silent': True}
if custom_obj:
arguments['objective'] = custom_obj
gbm = model(**arguments) gbm = model(**arguments)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
if return_model: return gbm if return_model:
else: return feval(y_test, gbm.predict_proba(X_test) if predict_proba else gbm.predict(X_test)) return gbm
elif predict_proba:
return feval(y_test, gbm.predict_proba(X_test))
else:
return feval(y_test, gbm.predict(X_test))
class TestSklearn(unittest.TestCase): class TestSklearn(unittest.TestCase):
def test_binary(self): def test_binary(self):
X_y= load_breast_cancer(True) X_y = load_breast_cancer(True)
ret = test_template(X_y, lgb.LGBMClassifier, log_loss, predict_proba=True) ret = test_template(X_y, lgb.LGBMClassifier, log_loss, predict_proba=True)
self.assertLess(ret, 0.15) self.assertLess(ret, 0.15)
...@@ -34,6 +44,7 @@ class TestSklearn(unittest.TestCase): ...@@ -34,6 +44,7 @@ class TestSklearn(unittest.TestCase):
def test_multiclass(self): def test_multiclass(self):
X_y = load_digits(10, True) X_y = load_digits(10, True)
def multi_error(y_true, y_pred): def multi_error(y_true, y_pred):
return np.mean(y_true != y_pred) return np.mean(y_true != y_pred)
ret = test_template(X_y, lgb.LGBMClassifier, multi_error) ret = test_template(X_y, lgb.LGBMClassifier, multi_error)
...@@ -67,6 +78,7 @@ class TestSklearn(unittest.TestCase): ...@@ -67,6 +78,7 @@ class TestSklearn(unittest.TestCase):
hess = y_pred * (1.0 - y_pred) hess = y_pred * (1.0 - y_pred)
return grad, hess return grad, hess
X_y = load_digits(2, True) X_y = load_digits(2, True)
def binary_error(y_test, y_pred): def binary_error(y_test, y_pred):
return np.mean([int(p > 0.5) != y for y, p in zip(y_test, y_pred)]) return np.mean([int(p > 0.5) != y for y, p in zip(y_test, y_pred)])
ret = test_template(X_y, lgb.LGBMClassifier, feval=binary_error, custom_obj=logregobj) ret = test_template(X_y, lgb.LGBMClassifier, feval=binary_error, custom_obj=logregobj)
...@@ -81,7 +93,8 @@ class TestSklearn(unittest.TestCase): ...@@ -81,7 +93,8 @@ class TestSklearn(unittest.TestCase):
def test_grid_search(self): def test_grid_search(self):
X_train, X_test, y_train, y_test = test_template(return_data=True) X_train, X_test, y_train, y_test = test_template(return_data=True)
params = {'boosting_type': ['dart', 'gbdt'], params = {'boosting_type': ['dart', 'gbdt'],
'n_estimators': [15, 20], 'drop_rate':[0.1, 0.2]} 'n_estimators': [15, 20],
'drop_rate': [0.1, 0.2]}
gbm = GridSearchCV(lgb.LGBMRegressor(), params, cv=3) gbm = GridSearchCV(lgb.LGBMRegressor(), params, cv=3)
gbm.fit(X_train, y_train) gbm.fit(X_train, y_train)
self.assertIn(gbm.best_params_['n_estimators'], [15, 20]) self.assertIn(gbm.best_params_['n_estimators'], [15, 20])
...@@ -114,6 +127,7 @@ class TestSklearn(unittest.TestCase): ...@@ -114,6 +127,7 @@ class TestSklearn(unittest.TestCase):
for preds in zip(pred_origin, pred_pickle): for preds in zip(pred_origin, pred_pickle):
self.assertAlmostEqual(*preds, places=5) self.assertAlmostEqual(*preds, places=5)
print("----------------------------------------------------------------------") print("----------------------------------------------------------------------")
print("running test_sklearn.py") print("running test_sklearn.py")
unittest.main() unittest.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment