Commit ebfc8521 authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

add an advanced example; add guide-python README.md details; clean error messages (#117)

parent b51c7be4
...@@ -16,3 +16,23 @@ Now you can run examples in this folder, for example: ...@@ -16,3 +16,23 @@ Now you can run examples in this folder, for example:
``` ```
python simple_example.py python simple_example.py
``` ```
Examples include:
- [simple_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py)
- Construct Dataset
- Basic train and predict
- Eval during training
- Early stopping
- Save model to file
- Dump model to json format
- Feature importances
- [sklearn_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/sklearn_example.py)
- Basic train and predict with sklearn interface
- Feature importances with sklearn interface
- [advanced_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py)
- Set feature names
- Directly use categorical features without one-hot encoding
- Load model file to continue training
- Change learning rates during training
- Self-defined objective function
- Self-defined eval metric
- Callback function
\ No newline at end of file
# coding: utf-8
# pylint: disable = invalid-name, C0111
import lightgbm as lgb
import pandas as pd
import numpy as np
# load or create your dataset
# (tab-separated files; column 0 is the binary label, the rest are features)
print('Load data...')
df_train = pd.read_csv('../binary_classification/binary.train', header=None, sep='\t')
df_test = pd.read_csv('../binary_classification/binary.test', header=None, sep='\t')
# per-row sample weights, one weight per line in the .weight files
W_train = pd.read_csv('../binary_classification/binary.train.weight', header=None)[0]
W_test = pd.read_csv('../binary_classification/binary.test.weight', header=None)[0]
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
num_train, num_feature = X_train.shape
# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train,
                        weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
                       weight=W_test, free_raw_data=False)
# training configuration, passed unchanged to every lgb.train() call below
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='binary_logloss',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)
# generate a feature name for every column: feature_0 ... feature_<num_feature-1>
feature_name = ['feature_' + str(col) for col in range(num_feature)]
print('Start training...')
# feature_name and categorical_feature
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,  # eval training data
                feature_name=feature_name,
                categorical_feature=[21])  # treat column 21 as categorical, no one-hot encoding
# check feature name
print('Finish first 10 rounds...')
print('7th feature name is:', repr(lgb_train.feature_name[6]))
# save model to file
gbm.save_model('model.txt')
# continue training
# init_model accepts:
# 1. model file name
# 2. Booster()
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model='model.txt',  # resume from the model saved above
                valid_sets=lgb_eval)
print('Finish 10 - 20 rounds with model file...')
# decay learning rates
# learning_rates accepts:
# 1. list/tuple with length = num_boost_round
# 2. function(curr_iter)
# 3. function(curr_iter, total_iter)
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,  # continue from the in-memory booster this time
                learning_rates=lambda iter: 0.05 * (0.99 ** iter),
                valid_sets=lgb_eval)
print('Finish 20 - 30 rounds with decay learning rates...')
# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelood(preds, train_data):
    """Binary log-likelihood objective.

    ``preds`` are raw scores; they are squashed through a sigmoid and the
    gradient/hessian of the log loss w.r.t. the raw scores are returned.
    (The misspelled name is kept because it is referenced by the caller.)
    """
    labels = train_data.get_label()
    prob = 1. / (1. + np.exp(-preds))
    return prob - labels, prob * (1. - prob)
# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
# binary error
def binary_error(preds, train_data):
    """Fraction of samples mis-classified at a 0.5 probability threshold."""
    y_true = train_data.get_label()
    mistakes = (preds > 0.5) != y_true
    return 'error', np.mean(mistakes), False
# train with the self-defined objective and metric above:
# fobj replaces the built-in objective, feval adds a custom eval metric
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,  # continue from the previous booster
                fobj=loglikelood,
                feval=binary_error,
                valid_sets=lgb_eval)
print('Finish 30 - 40 rounds with self-defined objective function and eval metric...')
print('Start a new training job...')
# callback
def reset_metrics():
    """Create a LightGBM callback that attaches an extra validation set.

    The returned callback runs before each boosting iteration (order 0).
    At the 5th iteration of the current run it builds a new validation
    Dataset from the module-level X_test/y_test and registers it on the
    booster, so later iterations are also evaluated on 'new valid'.
    """
    def callback(env):
        if env.iteration - env.begin_iteration == 5:
            # build the Dataset only when it is actually needed; the previous
            # version constructed a throwaway Dataset on every iteration
            lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
            print('Add a new valid dataset at iteration 5...')
            env.model.add_valid(lgb_eval_new, 'new valid')
    callback.before_iteration = True  # run before the boosting update of each iteration
    callback.order = 0
    return callback
# fresh training run that exercises a user-supplied callback
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,
                callbacks=[reset_metrics()])
print('Finish first 10 rounds with callback function...')
...@@ -6,6 +6,7 @@ import pandas as pd ...@@ -6,6 +6,7 @@ import pandas as pd
from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_squared_error
# load or create your dataset # load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t') df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t') df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
...@@ -18,7 +19,6 @@ X_test = df_test.drop(0, axis=1) ...@@ -18,7 +19,6 @@ X_test = df_test.drop(0, axis=1)
lgb_train = lgb.Dataset(X_train, y_train) lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict # specify your configurations as a dict
params = { params = {
'task' : 'train', 'task' : 'train',
...@@ -33,27 +33,32 @@ params = { ...@@ -33,27 +33,32 @@ params = {
'verbose' : 0 'verbose' : 0
} }
print('Start training...')
# train # train
gbm = lgb.train(params, gbm = lgb.train(params,
lgb_train, lgb_train,
num_boost_round=100, num_boost_round=20,
valid_sets=lgb_eval, valid_sets=lgb_eval,
early_stopping_rounds=10) early_stopping_rounds=5)
print('Save model...')
# save model to file # save model to file
gbm.save_model('model.txt') gbm.save_model('model.txt')
print('Start predicting...')
# predict # predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval # eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print('Dump model to JSON...')
# dump model to json (and save to file) # dump model to json (and save to file)
model_json = gbm.dump_model() model_json = gbm.dump_model()
with open('model.json', 'w+') as f: with open('model.json', 'w+') as f:
json.dump(model_json, f, indent=4) json.dump(model_json, f, indent=4)
print('Calculate feature importances...')
# feature importances # feature importances
print('Feature importances:', gbm.feature_importance()) print('Feature importances:', list(gbm.feature_importance()))
print('Feature importances:', gbm.feature_importance("gain")) # print('Feature importances:', list(gbm.feature_importance("gain")))
...@@ -5,6 +5,7 @@ import pandas as pd ...@@ -5,6 +5,7 @@ import pandas as pd
from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_squared_error
# load or create your dataset # load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t') df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t') df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
...@@ -13,19 +14,23 @@ y_test = df_test[0] ...@@ -13,19 +14,23 @@ y_test = df_test[0]
X_train = df_train.drop(0, axis=1) X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1) X_test = df_test.drop(0, axis=1)
print('Start training...')
# train # train
gbm = lgb.LGBMRegressor(objective='regression', gbm = lgb.LGBMRegressor(objective='regression',
num_leaves=31, num_leaves=31,
learning_rate=0.05, learning_rate=0.05,
n_estimators=100) n_estimators=20)
gbm.fit(X_train, y_train, gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)], eval_set=[(X_test, y_test)],
early_stopping_rounds=10) eval_metric='l1',
early_stopping_rounds=5)
print('Start predicting...')
# predict # predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval # eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print('Calculate feature importances...')
# feature importances # feature importances
print('Feature importances:', gbm.feature_importance()) print('Feature importances:', list(gbm.feature_importance()))
# coding: utf-8 # coding: utf-8
# pylint: disable = invalid-name, C0111, C0301, R0912, R0913, R0914, W0105 # pylint: disable = invalid-name, C0111, C0301
# pylint: disable = R0912, R0913, R0914, W0105, W0201, W0212
# pylint: disable = E1101 # pylint: disable = E1101
"""Wrapper c_api of LightGBM""" """Wrapper c_api of LightGBM"""
from __future__ import absolute_import from __future__ import absolute_import
...@@ -17,13 +18,11 @@ from .libpath import find_lib_path ...@@ -17,13 +18,11 @@ from .libpath import find_lib_path
"""pandas""" """pandas"""
try: try:
from pandas import Series, DataFrame from pandas import Series, DataFrame
IS_PANDAS_INSTALLED = True
except ImportError: except ImportError:
class Series(object): class Series(object):
pass pass
class DataFrame(object): class DataFrame(object):
pass pass
IS_PANDAS_INSTALLED = False
IS_PY3 = (sys.version_info[0] == 3) IS_PY3 = (sys.version_info[0] == 3)
...@@ -72,7 +71,7 @@ def is_1d_list(data): ...@@ -72,7 +71,7 @@ def is_1d_list(data):
return isinstance(data, list) and \ return isinstance(data, list) and \
(not data or isinstance(data[0], (int, float, bool))) (not data or isinstance(data[0], (int, float, bool)))
def list_to_1d_numpy(data, dtype): def list_to_1d_numpy(data, dtype=np.float32, name='list'):
"""convert to 1d numpy array""" """convert to 1d numpy array"""
if is_numpy_1d_array(data): if is_numpy_1d_array(data):
if data.dtype == dtype: if data.dtype == dtype:
...@@ -81,28 +80,26 @@ def list_to_1d_numpy(data, dtype): ...@@ -81,28 +80,26 @@ def list_to_1d_numpy(data, dtype):
return data.astype(dtype=dtype, copy=False) return data.astype(dtype=dtype, copy=False)
elif is_1d_list(data): elif is_1d_list(data):
return np.array(data, dtype=dtype, copy=False) return np.array(data, dtype=dtype, copy=False)
elif IS_PANDAS_INSTALLED and isinstance(data, Series): elif isinstance(data, Series):
return data.astype(dtype).values return data.values.astype(dtype)
else: else:
raise TypeError("Unknow type({})".format(type(data).__name__)) raise TypeError("Wrong type({}) for {}, should be list or numpy array".format(type(data).__name__, name))
def cfloat32_array_to_numpy(cptr, length): def cfloat32_array_to_numpy(cptr, length):
"""Convert a ctypes float pointer array to a numpy array. """Convert a ctypes float pointer array to a numpy array.
""" """
if isinstance(cptr, ctypes.POINTER(ctypes.c_float)): if isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
res = np.fromiter(cptr, dtype=np.float32, count=length) return np.fromiter(cptr, dtype=np.float32, count=length)
return res
else: else:
raise RuntimeError('expected float pointer') raise RuntimeError('Expected float pointer')
def cint32_array_to_numpy(cptr, length): def cint32_array_to_numpy(cptr, length):
"""Convert a ctypes float pointer array to a numpy array. """Convert a ctypes float pointer array to a numpy array.
""" """
if isinstance(cptr, ctypes.POINTER(ctypes.c_int32)): if isinstance(cptr, ctypes.POINTER(ctypes.c_int32)):
res = np.fromiter(cptr, dtype=np.int32, count=length) return np.fromiter(cptr, dtype=np.int32, count=length)
return res
else: else:
raise RuntimeError('expected int pointer') raise RuntimeError('Expected int pointer')
def c_str(string): def c_str(string):
"""Convert a python string to cstring.""" """Convert a python string to cstring."""
...@@ -113,7 +110,7 @@ def c_array(ctype, values): ...@@ -113,7 +110,7 @@ def c_array(ctype, values):
return (ctype * len(values))(*values) return (ctype * len(values))(*values)
def param_dict_to_str(data): def param_dict_to_str(data):
if not data: if data is None or not data:
return "" return ""
pairs = [] pairs = []
for key, val in data.items(): for key, val in data.items():
...@@ -122,7 +119,7 @@ def param_dict_to_str(data): ...@@ -122,7 +119,7 @@ def param_dict_to_str(data):
elif isinstance(val, (list, tuple, set)): elif isinstance(val, (list, tuple, set)):
pairs.append(str(key)+'='+','.join(map(str, val))) pairs.append(str(key)+'='+','.join(map(str, val)))
else: else:
raise TypeError('unknow type of parameter:%s , got:%s' raise TypeError('Unknown type of parameter:%s, got:%s'
% (key, type(val).__name__)) % (key, type(val).__name__))
return ' '.join(pairs) return ' '.join(pairs)
...@@ -158,10 +155,10 @@ def c_float_array(data): ...@@ -158,10 +155,10 @@ def c_float_array(data):
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double)) ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
type_data = C_API_DTYPE_FLOAT64 type_data = C_API_DTYPE_FLOAT64
else: else:
raise TypeError("expected np.float32 or np.float64, met type({})" raise TypeError("Expected np.float32 or np.float64, met type({})"
.format(data.dtype)) .format(data.dtype))
else: else:
raise TypeError("Unknow type({})".format(type(data).__name__)) raise TypeError("Unknown type({})".format(type(data).__name__))
return (ptr_data, type_data) return (ptr_data, type_data)
def c_int_array(data): def c_int_array(data):
...@@ -176,16 +173,16 @@ def c_int_array(data): ...@@ -176,16 +173,16 @@ def c_int_array(data):
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)) ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
type_data = C_API_DTYPE_INT64 type_data = C_API_DTYPE_INT64
else: else:
raise TypeError("expected np.int32 or np.int64, met type({})" raise TypeError("Expected np.int32 or np.int64, met type({})"
.format(data.dtype)) .format(data.dtype))
else: else:
raise TypeError("Unknow type({})".format(type(data).__name__)) raise TypeError("Unknown type({})".format(type(data).__name__))
return (ptr_data, type_data) return (ptr_data, type_data)
class _InnerPredictor(object): class _InnerPredictor(object):
""" """
A _InnerPredictor of LightGBM. A _InnerPredictor of LightGBM.
Only used for prediction, usually used for continued-train Only used for prediction, usually used for continued-train
Note: Can convert from Booster, but cannot convert to Booster Note: Can convert from Booster, but cannot convert to Booster
""" """
def __init__(self, model_file=None, booster_handle=None): def __init__(self, model_file=None, booster_handle=None):
...@@ -261,7 +258,7 @@ class _InnerPredictor(object): ...@@ -261,7 +258,7 @@ class _InnerPredictor(object):
Prediction result Prediction result
""" """
if isinstance(data, (_InnerDataset, Dataset)): if isinstance(data, (_InnerDataset, Dataset)):
raise TypeError("cannot use Dataset instance for prediction, please use raw data instead") raise TypeError("Cannot use Dataset instance for prediction, please use raw data instead")
predict_type = C_API_PREDICT_NORMAL predict_type = C_API_PREDICT_NORMAL
if raw_score: if raw_score:
predict_type = C_API_PREDICT_RAW_SCORE predict_type = C_API_PREDICT_RAW_SCORE
...@@ -290,7 +287,7 @@ class _InnerPredictor(object): ...@@ -290,7 +287,7 @@ class _InnerPredictor(object):
elif isinstance(data, np.ndarray): elif isinstance(data, np.ndarray):
preds, nrow = self.__pred_for_np2d(data, num_iteration, preds, nrow = self.__pred_for_np2d(data, num_iteration,
predict_type) predict_type)
elif IS_PANDAS_INSTALLED and isinstance(data, DataFrame): elif isinstance(data, DataFrame):
preds, nrow = self.__pred_for_np2d(data.values, num_iteration, preds, nrow = self.__pred_for_np2d(data.values, num_iteration,
predict_type) predict_type)
else: else:
...@@ -299,15 +296,14 @@ class _InnerPredictor(object): ...@@ -299,15 +296,14 @@ class _InnerPredictor(object):
preds, nrow = self.__pred_for_csr(csr, num_iteration, preds, nrow = self.__pred_for_csr(csr, num_iteration,
predict_type) predict_type)
except: except:
raise TypeError('can not predict data for type {}'. raise TypeError('Cannot predict data for type {}'.format(type(data).__name__))
format(type(data).__name__))
if pred_leaf: if pred_leaf:
preds = preds.astype(np.int32) preds = preds.astype(np.int32)
if is_reshape and preds.size != nrow: if is_reshape and preds.size != nrow:
if preds.size % nrow == 0: if preds.size % nrow == 0:
preds = preds.reshape(nrow, -1) preds = preds.reshape(nrow, -1)
else: else:
raise ValueError('length of predict result (%d) cannot be divide nrow (%d)' raise ValueError('Length of predict result (%d) cannot be divide nrow (%d)'
% (preds.size, nrow)) % (preds.size, nrow))
return preds return preds
...@@ -353,7 +349,7 @@ class _InnerPredictor(object): ...@@ -353,7 +349,7 @@ class _InnerPredictor(object):
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
)) ))
if n_preds != out_num_preds.value: if n_preds != out_num_preds.value:
raise ValueError("incorrect number for predict result") raise ValueError("Wrong length for predict results")
return preds, mat.shape[0] return preds, mat.shape[0]
def __pred_for_csr(self, csr, num_iteration, predict_type): def __pred_for_csr(self, csr, num_iteration, predict_type):
...@@ -384,7 +380,7 @@ class _InnerPredictor(object): ...@@ -384,7 +380,7 @@ class _InnerPredictor(object):
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
)) ))
if n_preds != out_num_preds.value: if n_preds != out_num_preds.value:
raise ValueError("incorrect number for predict result") raise ValueError("Wrong length for predict results")
return preds, nrow return preds, nrow
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int', PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
...@@ -481,10 +477,10 @@ class _InnerDataset(object): ...@@ -481,10 +477,10 @@ class _InnerDataset(object):
elif isinstance(name, int): elif isinstance(name, int):
categorical_indices.add(name) categorical_indices.add(name)
else: else:
raise TypeError("unknown type({}) or unknown name({}) in categorical_feature" \ raise TypeError("Wrong type({}) or unknown name({}) in categorical_feature" \
.format(type(name).__name__, name)) .format(type(name).__name__, name))
params['categorical_column'] = categorical_indices params['categorical_column'] = sorted(categorical_indices)
params_str = param_dict_to_str(params) params_str = param_dict_to_str(params)
"""process for reference dataset""" """process for reference dataset"""
...@@ -514,11 +510,11 @@ class _InnerDataset(object): ...@@ -514,11 +510,11 @@ class _InnerDataset(object):
csr = scipy.sparse.csr_matrix(data) csr = scipy.sparse.csr_matrix(data)
self.__init_from_csr(csr, params_str, ref_dataset) self.__init_from_csr(csr, params_str, ref_dataset)
except: except:
raise TypeError('can not initialize _InnerDataset from {}'.format(type(data).__name__)) raise TypeError('Cannot initialize _InnerDataset from {}'.format(type(data).__name__))
if label is not None: if label is not None:
self.set_label(label) self.set_label(label)
if self.get_label() is None: if self.get_label() is None:
raise ValueError("label should not be None") raise ValueError("Label should not be None")
if weight is not None: if weight is not None:
self.set_weight(weight) self.set_weight(weight)
if group is not None: if group is not None:
...@@ -572,7 +568,7 @@ class _InnerDataset(object): ...@@ -572,7 +568,7 @@ class _InnerDataset(object):
""" """
Get subset of current dataset Get subset of current dataset
""" """
used_indices = list_to_1d_numpy(used_indices, np.int32) used_indices = list_to_1d_numpy(used_indices, np.int32, name='used_indices')
ret = _InnerDataset(None) ret = _InnerDataset(None)
ret.handle = ctypes.c_void_p() ret.handle = ctypes.c_void_p()
params_str = param_dict_to_str(params) params_str = param_dict_to_str(params)
...@@ -585,7 +581,7 @@ class _InnerDataset(object): ...@@ -585,7 +581,7 @@ class _InnerDataset(object):
ret.max_bin = self.max_bin ret.max_bin = self.max_bin
ret.predictor = self.predictor ret.predictor = self.predictor
if ret.get_label() is None: if ret.get_label() is None:
raise ValueError("label should not be None") raise ValueError("Label should not be None")
return ret return ret
def set_feature_name(self, feature_name): def set_feature_name(self, feature_name):
...@@ -595,7 +591,7 @@ class _InnerDataset(object): ...@@ -595,7 +591,7 @@ class _InnerDataset(object):
if feature_name is None: if feature_name is None:
return return
if len(feature_name) != self.num_feature(): if len(feature_name) != self.num_feature():
raise ValueError("size of feature_name error") raise ValueError("Length of feature_name({}) and num_feature({}) don't match".format(len(feature_name), self.num_feature()))
c_feature_name = [c_str(name) for name in feature_name] c_feature_name = [c_str(name) for name in feature_name]
_safe_call(_LIB.LGBM_DatasetSetFeatureNames( _safe_call(_LIB.LGBM_DatasetSetFeatureNames(
self.handle, self.handle,
...@@ -632,7 +628,7 @@ class _InnerDataset(object): ...@@ -632,7 +628,7 @@ class _InnerDataset(object):
Initialize data from a CSR matrix. Initialize data from a CSR matrix.
""" """
if len(csr.indices) != len(csr.data): if len(csr.indices) != len(csr.data):
raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data))) raise ValueError('Length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
self.handle = ctypes.c_void_p() self.handle = ctypes.c_void_p()
ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr) ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
...@@ -685,7 +681,7 @@ class _InnerDataset(object): ...@@ -685,7 +681,7 @@ class _InnerDataset(object):
elif out_type.value == C_API_DTYPE_FLOAT32: elif out_type.value == C_API_DTYPE_FLOAT32:
return cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value) return cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value)
else: else:
raise TypeError("unknow type") raise TypeError("Unknown type")
def set_field(self, field_name, data): def set_field(self, field_name, data):
"""Set property into the _InnerDataset. """Set property into the _InnerDataset.
...@@ -707,11 +703,8 @@ class _InnerDataset(object): ...@@ -707,11 +703,8 @@ class _InnerDataset(object):
0, 0,
FIELD_TYPE_MAPPER[field_name])) FIELD_TYPE_MAPPER[field_name]))
return return
if IS_PANDAS_INSTALLED and isinstance(data, Series): dtype = np.int32 if field_name == 'group' else np.float32
dtype = np.int32 if field_name == 'group' else np.float32 data = list_to_1d_numpy(data, dtype, name=field_name)
data = data.astype(dtype).values
if not is_numpy_1d_array(data):
raise TypeError("Unknow type({})".format(type(data).__name__))
if data.dtype == np.float32: if data.dtype == np.float32:
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
type_data = C_API_DTYPE_FLOAT32 type_data = C_API_DTYPE_FLOAT32
...@@ -719,9 +712,9 @@ class _InnerDataset(object): ...@@ -719,9 +712,9 @@ class _InnerDataset(object):
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)) ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
type_data = C_API_DTYPE_INT32 type_data = C_API_DTYPE_INT32
else: else:
raise TypeError("excepted np.float32 or np.int32, met type({})".format(data.dtype)) raise TypeError("Excepted np.float32 or np.int32, meet type({})".format(data.dtype))
if type_data != FIELD_TYPE_MAPPER[field_name]: if type_data != FIELD_TYPE_MAPPER[field_name]:
raise TypeError("type error for set_field") raise TypeError("Input type error for set_field")
_safe_call(_LIB.LGBM_DatasetSetField( _safe_call(_LIB.LGBM_DatasetSetField(
self.handle, self.handle,
c_str(field_name), c_str(field_name),
...@@ -749,7 +742,7 @@ class _InnerDataset(object): ...@@ -749,7 +742,7 @@ class _InnerDataset(object):
label: numpy array or list or None label: numpy array or list or None
The label information to be set into _InnerDataset The label information to be set into _InnerDataset
""" """
label = list_to_1d_numpy(label, np.float32) label = list_to_1d_numpy(label, name='label')
self.set_field('label', label) self.set_field('label', label)
def set_weight(self, weight): def set_weight(self, weight):
...@@ -761,7 +754,7 @@ class _InnerDataset(object): ...@@ -761,7 +754,7 @@ class _InnerDataset(object):
Weight for each data point Weight for each data point
""" """
if weight is not None: if weight is not None:
weight = list_to_1d_numpy(weight, np.float32) weight = list_to_1d_numpy(weight, name='weight')
self.set_field('weight', weight) self.set_field('weight', weight)
def set_init_score(self, score): def set_init_score(self, score):
...@@ -773,7 +766,7 @@ class _InnerDataset(object): ...@@ -773,7 +766,7 @@ class _InnerDataset(object):
Init score for booster Init score for booster
""" """
if score is not None: if score is not None:
score = list_to_1d_numpy(score, np.float32) score = list_to_1d_numpy(score, name='init score')
self.set_field('init_score', score) self.set_field('init_score', score)
def set_group(self, group): def set_group(self, group):
...@@ -785,7 +778,7 @@ class _InnerDataset(object): ...@@ -785,7 +778,7 @@ class _InnerDataset(object):
Group size of each group Group size of each group
""" """
if group is not None: if group is not None:
group = list_to_1d_numpy(group, np.int32) group = list_to_1d_numpy(group, np.int32, name='group')
self.set_field('group', group) self.set_field('group', group)
def get_label(self): def get_label(self):
...@@ -940,8 +933,9 @@ class Dataset(object): ...@@ -940,8 +933,9 @@ class Dataset(object):
self.used_indices, self.params) self.used_indices, self.params)
else: else:
self.inner_dataset = _InnerDataset(self.data, self.label, self.max_bin, self.inner_dataset = _InnerDataset(self.data, self.label, self.max_bin,
None, self.weight, self.group, self._predictor, None, self.weight, self.group, self._predictor,
self.silent, self.feature_name, self.categorical_feature, self.params) self.silent, self.feature_name,
self.categorical_feature, self.params)
if self.free_raw_data: if self.free_raw_data:
self.data = None self.data = None
...@@ -994,7 +988,7 @@ class Dataset(object): ...@@ -994,7 +988,7 @@ class Dataset(object):
Parameters Parameters
---------- ----------
reference : Dataset reference : Dataset
will use reference as template to consturct current dataset Will use reference as template to consturct current dataset
""" """
self.set_categorical_feature(reference.categorical_feature) self.set_categorical_feature(reference.categorical_feature)
self.set_feature_name(reference.feature_name) self.set_feature_name(reference.feature_name)
...@@ -1015,7 +1009,7 @@ class Dataset(object): ...@@ -1015,7 +1009,7 @@ class Dataset(object):
Parameters Parameters
---------- ----------
feature_name : list of str feature_name : list of str
feature names Feature names
""" """
self.feature_name = feature_name self.feature_name = feature_name
if self.__is_constructed(): if self.__is_constructed():
...@@ -1028,9 +1022,9 @@ class Dataset(object): ...@@ -1028,9 +1022,9 @@ class Dataset(object):
Parameters Parameters
---------- ----------
used_indices : list of int used_indices : list of int
use indices of this subset Used indices of this subset
params : dict params : dict
other parameters Other parameters
""" """
ret = Dataset(None) ret = Dataset(None)
ret.feature_name = self.feature_name ret.feature_name = self.feature_name
...@@ -1198,7 +1192,7 @@ class Booster(object): ...@@ -1198,7 +1192,7 @@ class Booster(object):
if train_set is not None: if train_set is not None:
"""Training task""" """Training task"""
if not isinstance(train_set, Dataset): if not isinstance(train_set, Dataset):
raise TypeError('training data should be Dataset instance, met {}'.format(type(train_set).__name__)) raise TypeError('Training data should be Dataset instance, met {}'.format(type(train_set).__name__))
params_str = param_dict_to_str(params) params_str = param_dict_to_str(params)
"""construct booster object""" """construct booster object"""
_safe_call(_LIB.LGBM_BoosterCreate( _safe_call(_LIB.LGBM_BoosterCreate(
...@@ -1237,7 +1231,7 @@ class Booster(object): ...@@ -1237,7 +1231,7 @@ class Booster(object):
ctypes.byref(out_num_class))) ctypes.byref(out_num_class)))
self.__num_class = out_num_class.value self.__num_class = out_num_class.value
else: else:
raise TypeError('At least need training dataset or model file to create booster instance') raise TypeError('Need at least one training dataset or model file to create booster instance')
def __del__(self): def __del__(self):
if self.handle is not None: if self.handle is not None:
...@@ -1342,22 +1336,10 @@ class Booster(object): ...@@ -1342,22 +1336,10 @@ class Booster(object):
------- -------
is_finished, bool is_finished, bool
""" """
if not is_numpy_1d_array(grad): grad = list_to_1d_numpy(grad, name='gradient')
if is_1d_list(grad): hess = list_to_1d_numpy(hess, name='hessian')
grad = np.array(grad, dtype=np.float32, copy=False)
else:
raise TypeError("grad should be numpy 1d array or 1d list")
if not is_numpy_1d_array(hess):
if is_1d_list(hess):
hess = np.array(hess, dtype=np.float32, copy=False)
else:
raise TypeError("hess should be numpy 1d array or 1d list")
if len(grad) != len(hess): if len(grad) != len(hess):
raise ValueError('grad / hess lengths mismatch: {} / {}'.format(len(grad), len(hess))) raise ValueError("Lengths of gradient({}) and hessian({}) don't match".format(len(grad), len(hess)))
if grad.dtype != np.float32:
grad = grad.astype(np.float32, copy=False)
if hess.dtype != np.float32:
hess = hess.astype(np.float32, copy=False)
is_finished = ctypes.c_int(0) is_finished = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom( _safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom(
self.handle, self.handle,
...@@ -1548,7 +1530,7 @@ class Booster(object): ...@@ -1548,7 +1530,7 @@ class Booster(object):
Evaulate training or validation data Evaulate training or validation data
""" """
if data_idx >= self.__num_dataset: if data_idx >= self.__num_dataset:
raise ValueError("data_idx should be smaller than number of dataset") raise ValueError("Data_idx should be smaller than number of dataset")
self.__get_eval_info() self.__get_eval_info()
ret = [] ret = []
if self.__num_inner_eval > 0: if self.__num_inner_eval > 0:
...@@ -1560,7 +1542,7 @@ class Booster(object): ...@@ -1560,7 +1542,7 @@ class Booster(object):
ctypes.byref(tmp_out_len), ctypes.byref(tmp_out_len),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))) result.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
if tmp_out_len.value != self.__num_inner_eval: if tmp_out_len.value != self.__num_inner_eval:
raise ValueError("incorrect number of eval results") raise ValueError("Wrong length of eval results")
for i in range(self.__num_inner_eval): for i in range(self.__num_inner_eval):
ret.append((data_name, self.__name_inner_eval[i], result[i], self.__higher_better_inner_eval[i])) ret.append((data_name, self.__name_inner_eval[i], result[i], self.__higher_better_inner_eval[i]))
if feval is not None: if feval is not None:
...@@ -1582,7 +1564,7 @@ class Booster(object): ...@@ -1582,7 +1564,7 @@ class Booster(object):
Predict for training and validation dataset Predict for training and validation dataset
""" """
if data_idx >= self.__num_dataset: if data_idx >= self.__num_dataset:
raise ValueError("data_idx should be smaller than number of dataset") raise ValueError("Data_idx should be smaller than number of dataset")
if self.__inner_predict_buffer[data_idx] is None: if self.__inner_predict_buffer[data_idx] is None:
if data_idx == 0: if data_idx == 0:
n_preds = self.train_set.num_data() * self.__num_class n_preds = self.train_set.num_data() * self.__num_class
...@@ -1600,7 +1582,7 @@ class Booster(object): ...@@ -1600,7 +1582,7 @@ class Booster(object):
ctypes.byref(tmp_out_len), ctypes.byref(tmp_out_len),
data_ptr)) data_ptr))
if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]): if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]):
raise ValueError("incorrect number of predict results for data %d" % (data_idx)) raise ValueError("Wrong length of predict results for data %d" % (data_idx))
self.__is_predicted_cur_iter[data_idx] = True self.__is_predicted_cur_iter[data_idx] = True
return self.__inner_predict_buffer[data_idx] return self.__inner_predict_buffer[data_idx]
...@@ -1626,7 +1608,7 @@ class Booster(object): ...@@ -1626,7 +1608,7 @@ class Booster(object):
ctypes.byref(tmp_out_len), ctypes.byref(tmp_out_len),
ptr_string_buffers)) ptr_string_buffers))
if self.__num_inner_eval != tmp_out_len.value: if self.__num_inner_eval != tmp_out_len.value:
raise ValueError("size of eval names doesn't equal with num_evals") raise ValueError("Length of eval names doesn't equal with num_evals")
self.__name_inner_eval = \ self.__name_inner_eval = \
[string_buffers[i].value.decode() for i in range(self.__num_inner_eval)] [string_buffers[i].value.decode() for i in range(self.__num_inner_eval)]
self.__higher_better_inner_eval = \ self.__higher_better_inner_eval = \
...@@ -1658,7 +1640,7 @@ class Booster(object): ...@@ -1658,7 +1640,7 @@ class Booster(object):
for key, value in kwargs.items(): for key, value in kwargs.items():
if value is not None: if value is not None:
if not is_str(value): if not is_str(value):
raise ValueError("set_attr only accepts string values") raise ValueError("Set attr only accepts strings")
self.__attr[key] = value self.__attr[key] = value
else: else:
self.__attr.pop(key, None) self.__attr.pop(key, None)
...@@ -35,7 +35,7 @@ def _format_eval_result(value, show_stdv=True): ...@@ -35,7 +35,7 @@ def _format_eval_result(value, show_stdv=True):
else: else:
return '%s\'s %s:%g' % (value[0], value[1], value[2]) return '%s\'s %s:%g' % (value[0], value[1], value[2])
else: else:
raise ValueError("wrong metric value") raise ValueError("Wrong metric value")
def print_evaluation(period=1, show_stdv=True): def print_evaluation(period=1, show_stdv=True):
...@@ -80,7 +80,7 @@ def record_evaluation(eval_result): ...@@ -80,7 +80,7 @@ def record_evaluation(eval_result):
The requested callback function. The requested callback function.
""" """
if not isinstance(eval_result, dict): if not isinstance(eval_result, dict):
raise TypeError('eval_result has to be a dictionary') raise TypeError('Eval_result should be a dictionary')
eval_result.clear() eval_result.clear()
def init(env): def init(env):
...@@ -164,7 +164,7 @@ def early_stop(stopping_rounds, verbose=True): ...@@ -164,7 +164,7 @@ def early_stop(stopping_rounds, verbose=True):
def init(env): def init(env):
"""internal function""" """internal function"""
if not env.evaluation_result_list: if not env.evaluation_result_list:
raise ValueError('For early stopping you need at least one set in evals.') raise ValueError('For early stopping, at least one dataset is required for evaluation')
if verbose: if verbose:
msg = "Train until valid scores didn't improve in {} rounds." msg = "Train until valid scores didn't improve in {} rounds."
...@@ -194,7 +194,7 @@ def early_stop(stopping_rounds, verbose=True): ...@@ -194,7 +194,7 @@ def early_stop(stopping_rounds, verbose=True):
if env.model is not None: if env.model is not None:
env.model.set_attr(best_iteration=str(best_iter[i])) env.model.set_attr(best_iteration=str(best_iter[i]))
if verbose: if verbose:
print('early stopping, best iteration is:') print('Early stopping, best iteration is:')
print(best_msg[i]) print(best_msg[i])
raise EarlyStopException(best_iter[i]) raise EarlyStopException(best_iter[i])
callback.order = 30 callback.order = 30
......
...@@ -85,10 +85,10 @@ def train(params, train_set, num_boost_round=100, ...@@ -85,10 +85,10 @@ def train(params, train_set, num_boost_round=100,
predictor = init_model._to_predictor() predictor = init_model._to_predictor()
else: else:
predictor = None predictor = None
init_iteration = predictor.num_total_iteration if predictor else 0 init_iteration = predictor.num_total_iteration if predictor is not None else 0
"""check dataset""" """check dataset"""
if not isinstance(train_set, Dataset): if not isinstance(train_set, Dataset):
raise TypeError("only can accept Dataset instance for traninig") raise TypeError("Traninig only accepts Dataset object")
train_set._set_predictor(predictor) train_set._set_predictor(predictor)
train_set.set_feature_name(feature_name) train_set.set_feature_name(feature_name)
...@@ -98,7 +98,7 @@ def train(params, train_set, num_boost_round=100, ...@@ -98,7 +98,7 @@ def train(params, train_set, num_boost_round=100,
train_data_name = "training" train_data_name = "training"
reduced_valid_sets = [] reduced_valid_sets = []
name_valid_sets = [] name_valid_sets = []
if valid_sets: if valid_sets is not None:
if isinstance(valid_sets, Dataset): if isinstance(valid_sets, Dataset):
valid_sets = [valid_sets] valid_sets = [valid_sets]
if isinstance(valid_names, str): if isinstance(valid_names, str):
...@@ -111,7 +111,7 @@ def train(params, train_set, num_boost_round=100, ...@@ -111,7 +111,7 @@ def train(params, train_set, num_boost_round=100,
train_data_name = valid_names[i] train_data_name = valid_names[i]
continue continue
if not isinstance(valid_data, Dataset): if not isinstance(valid_data, Dataset):
raise TypeError("only can accept Dataset instance for traninig") raise TypeError("Traninig only accepts Dataset object")
valid_data.set_reference(train_set) valid_data.set_reference(train_set)
reduced_valid_sets.append(valid_data) reduced_valid_sets.append(valid_data)
if valid_names is not None and len(valid_names) > i: if valid_names is not None and len(valid_names) > i:
...@@ -120,7 +120,7 @@ def train(params, train_set, num_boost_round=100, ...@@ -120,7 +120,7 @@ def train(params, train_set, num_boost_round=100,
name_valid_sets.append('valid_'+str(i)) name_valid_sets.append('valid_'+str(i))
"""process callbacks""" """process callbacks"""
if not callbacks: if callbacks is None:
callbacks = set() callbacks = set()
else: else:
for i, cb in enumerate(callbacks): for i, cb in enumerate(callbacks):
...@@ -133,7 +133,7 @@ def train(params, train_set, num_boost_round=100, ...@@ -133,7 +133,7 @@ def train(params, train_set, num_boost_round=100,
elif isinstance(verbose_eval, int): elif isinstance(verbose_eval, int):
callbacks.add(callback.print_evaluation(verbose_eval)) callbacks.add(callback.print_evaluation(verbose_eval))
if early_stopping_rounds: if early_stopping_rounds is not None:
callbacks.add(callback.early_stop(early_stopping_rounds, callbacks.add(callback.early_stop(early_stopping_rounds,
verbose=bool(verbose_eval))) verbose=bool(verbose_eval)))
...@@ -169,7 +169,7 @@ def train(params, train_set, num_boost_round=100, ...@@ -169,7 +169,7 @@ def train(params, train_set, num_boost_round=100,
evaluation_result_list = [] evaluation_result_list = []
# check evaluation result. # check evaluation result.
if valid_sets: if valid_sets is not None:
if is_valid_contain_train: if is_valid_contain_train:
evaluation_result_list.extend(booster.eval_train(feval)) evaluation_result_list.extend(booster.eval_train(feval))
evaluation_result_list.extend(booster.eval_valid(feval)) evaluation_result_list.extend(booster.eval_valid(feval))
...@@ -227,7 +227,7 @@ def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=Fals ...@@ -227,7 +227,7 @@ def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=Fals
sfk = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed) sfk = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
idset = [x[1] for x in sfk.split(X=full_data.get_label(), y=full_data.get_label())] idset = [x[1] for x in sfk.split(X=full_data.get_label(), y=full_data.get_label())]
else: else:
raise LightGBMError('sklearn needs to be installed in order to use stratified cv') raise LightGBMError('Scikit-learn is required for stratified cv')
else: else:
full_data.construct() full_data.construct()
randidx = np.random.permutation(full_data.num_data()) randidx = np.random.permutation(full_data.num_data())
...@@ -318,7 +318,7 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False, ...@@ -318,7 +318,7 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
evaluation history : list(string) evaluation history : list(string)
""" """
if not isinstance(train_set, Dataset): if not isinstance(train_set, Dataset):
raise TypeError("only can accept Dataset instance for traninig") raise TypeError("Traninig only accepts Dataset object")
if is_str(init_model): if is_str(init_model):
predictor = _InnerPredictor(model_file=init_model) predictor = _InnerPredictor(model_file=init_model)
...@@ -342,13 +342,13 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False, ...@@ -342,13 +342,13 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
cvfolds = _make_n_folds(train_set, nfold, params, seed, fpreproc, stratified) cvfolds = _make_n_folds(train_set, nfold, params, seed, fpreproc, stratified)
# setup callbacks # setup callbacks
if not callbacks: if callbacks is None:
callbacks = set() callbacks = set()
else: else:
for i, cb in enumerate(callbacks): for i, cb in enumerate(callbacks):
cb.__dict__.setdefault('order', i - len(callbacks)) cb.__dict__.setdefault('order', i - len(callbacks))
callbacks = set(callbacks) callbacks = set(callbacks)
if early_stopping_rounds: if early_stopping_rounds is not None:
callbacks.add(callback.early_stop(early_stopping_rounds, verbose=False)) callbacks.add(callback.early_stop(early_stopping_rounds, verbose=False))
if verbose_eval is True: if verbose_eval is True:
callbacks.add(callback.print_evaluation(show_stdv=show_stdv)) callbacks.add(callback.print_evaluation(show_stdv=show_stdv))
......
...@@ -6,7 +6,7 @@ from __future__ import absolute_import ...@@ -6,7 +6,7 @@ from __future__ import absolute_import
import numpy as np import numpy as np
from .basic import LightGBMError, Dataset, is_str from .basic import LightGBMError, Dataset, is_str
from .engine import train from .engine import train
# sklearn '''sklearn'''
try: try:
from sklearn.base import BaseEstimator from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin, ClassifierMixin from sklearn.base import RegressorMixin, ClassifierMixin
...@@ -38,7 +38,6 @@ def _point_wise_objective(func): ...@@ -38,7 +38,6 @@ def _point_wise_objective(func):
y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class) y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
The predicted values The predicted values
Returns Returns
------- -------
new_func: callable new_func: callable
...@@ -66,7 +65,7 @@ def _point_wise_objective(func): ...@@ -66,7 +65,7 @@ def _point_wise_objective(func):
num_data = len(weight) num_data = len(weight)
num_class = len(grad) // num_data num_class = len(grad) // num_data
if num_class * num_data != len(grad): if num_class * num_data != len(grad):
raise ValueError("length of grad and hess should equal to num_class * num_data") raise ValueError("Length of grad and hess should equal to num_class * num_data")
for k in range(num_class): for k in range(num_class):
for i in range(num_data): for i in range(num_data):
idx = k * num_data + i idx = k * num_data + i
...@@ -147,7 +146,7 @@ class LGBMModel(LGBMModelBase): ...@@ -147,7 +146,7 @@ class LGBMModel(LGBMModelBase):
reg_alpha=0, reg_lambda=0, scale_pos_weight=1, reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0): is_unbalance=False, seed=0):
if not SKLEARN_INSTALLED: if not SKLEARN_INSTALLED:
raise LightGBMError('sklearn needs to be installed in order to use this module') raise LightGBMError('Scikit-learn is required for this module')
self.num_leaves = num_leaves self.num_leaves = num_leaves
self.max_depth = max_depth self.max_depth = max_depth
...@@ -185,7 +184,7 @@ class LGBMModel(LGBMModelBase): ...@@ -185,7 +184,7 @@ class LGBMModel(LGBMModelBase):
booster : a lightgbm booster of underlying model booster : a lightgbm booster of underlying model
""" """
if self._Booster is None: if self._Booster is None:
raise LightGBMError('need to call fit beforehand') raise LightGBMError('Need to call fit beforehand')
return self._Booster return self._Booster
def get_params(self, deep=False): def get_params(self, deep=False):
...@@ -196,8 +195,8 @@ class LGBMModel(LGBMModelBase): ...@@ -196,8 +195,8 @@ class LGBMModel(LGBMModelBase):
return params return params
def fit(self, X, y, def fit(self, X, y,
sample_weight=None, init_score=None, group=None, sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None, eval_set=None, eval_sample_weight=None,
eval_init_score=None, eval_group=None, eval_init_score=None, eval_group=None,
eval_metric=None, eval_metric=None,
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
...@@ -343,7 +342,7 @@ class LGBMModel(LGBMModelBase): ...@@ -343,7 +342,7 @@ class LGBMModel(LGBMModelBase):
if self.evals_result_: if self.evals_result_:
evals_result = self.evals_result_ evals_result = self.evals_result_
else: else:
raise LightGBMError('No results.') raise LightGBMError('No results found.')
return evals_result return evals_result
...@@ -362,7 +361,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase): ...@@ -362,7 +361,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
def fit(self, X, y, def fit(self, X, y,
sample_weight=None, init_score=None, sample_weight=None, init_score=None,
eval_set=None, eval_sample_weight=None, eval_set=None, eval_sample_weight=None,
eval_init_score=None, eval_init_score=None,
eval_metric=None, eval_metric=None,
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
...@@ -370,10 +369,10 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase): ...@@ -370,10 +369,10 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
other_params=None): other_params=None):
super(LGBMRegressor, self).fit(X, y, sample_weight, init_score, None, super(LGBMRegressor, self).fit(X, y, sample_weight, init_score, None,
eval_set, eval_sample_weight, eval_init_score, None, eval_set, eval_sample_weight, eval_init_score, None,
eval_metric, early_stopping_rounds, eval_metric, early_stopping_rounds,
verbose, feature_name, categorical_feature, verbose, feature_name, categorical_feature,
other_params) other_params)
return self return self
class LGBMClassifier(LGBMModel, LGBMClassifierBase): class LGBMClassifier(LGBMModel, LGBMClassifierBase):
...@@ -390,15 +389,15 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase): ...@@ -390,15 +389,15 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
is_unbalance=False, seed=0): is_unbalance=False, seed=0):
super(LGBMClassifier, self).__init__(num_leaves, max_depth, super(LGBMClassifier, self).__init__(num_leaves, max_depth,
learning_rate, n_estimators, max_bin, learning_rate, n_estimators, max_bin,
silent, objective, silent, objective, nthread,
nthread, min_split_gain, min_child_weight, min_child_samples, min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree, subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight, reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed) is_unbalance, seed)
def fit(self, X, y, def fit(self, X, y,
sample_weight=None, init_score=None, sample_weight=None, init_score=None,
eval_set=None, eval_sample_weight=None, eval_set=None, eval_sample_weight=None,
eval_init_score=None, eval_init_score=None,
eval_metric=None, eval_metric=None,
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
...@@ -480,7 +479,7 @@ def _group_wise_objective(func): ...@@ -480,7 +479,7 @@ def _group_wise_objective(func):
labels = dataset.get_label() labels = dataset.get_label()
group = dataset.get_group() group = dataset.get_group()
if group is None: if group is None:
raise ValueError("group should not be None for ranking task") raise ValueError("Group should not be None for ranking task")
grad, hess = func(labels, group, preds) grad, hess = func(labels, group, preds)
"""weighted for objective""" """weighted for objective"""
weight = dataset.get_weight() weight = dataset.get_weight()
...@@ -490,7 +489,7 @@ def _group_wise_objective(func): ...@@ -490,7 +489,7 @@ def _group_wise_objective(func):
grad = np.multiply(grad, weight) grad = np.multiply(grad, weight)
hess = np.multiply(hess, weight) hess = np.multiply(hess, weight)
else: else:
raise ValueError("lenght of grad and hess should equal with num_data") raise ValueError("Length of grad and hess should equal with num_data")
return grad, hess return grad, hess
return inner return inner
...@@ -507,20 +506,20 @@ class LGBMRanker(LGBMModel): ...@@ -507,20 +506,20 @@ class LGBMRanker(LGBMModel):
reg_alpha=0, reg_lambda=0, scale_pos_weight=1, reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0): is_unbalance=False, seed=0):
super(LGBMRanker, self).__init__(num_leaves, max_depth, super(LGBMRanker, self).__init__(num_leaves, max_depth,
learning_rate, n_estimators, max_bin, learning_rate, n_estimators, max_bin,
silent, objective, silent, objective, nthread,
nthread, min_split_gain, min_child_weight, min_child_samples, min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree, subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight, reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed) is_unbalance, seed)
if callable(self.objective): if callable(self.objective):
self.fobj = _group_wise_objective(self.objective) self.fobj = _group_wise_objective(self.objective)
else: else:
self.fobj = None self.fobj = None
def fit(self, X, y, def fit(self, X, y,
sample_weight=None, init_score=None, group=None, sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None, eval_set=None, eval_sample_weight=None,
eval_init_score=None, eval_group=None, eval_init_score=None, eval_group=None,
eval_metric=None, eval_at=None, eval_metric=None, eval_at=None,
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
...@@ -535,17 +534,18 @@ class LGBMRanker(LGBMModel): ...@@ -535,17 +534,18 @@ class LGBMRanker(LGBMModel):
"""check group data""" """check group data"""
if group is None: if group is None:
raise ValueError("should use group for ranking task") raise ValueError("Should set group for ranking task")
if eval_set is not None: if eval_set is not None:
if eval_group is None: if eval_group is None:
raise ValueError("eval_group cannot be None when eval_set is not None") raise ValueError("Eval_group cannot be None when eval_set is not None")
elif len(eval_group) != len(eval_set): elif len(eval_group) != len(eval_set):
raise ValueError("length of eval_group should equal with eval_set") raise ValueError("Length of eval_group should equal to eval_set")
else: else:
for inner_group in eval_group: for inner_group in eval_group:
if inner_group is None: if inner_group is None:
raise ValueError("should set group for all eval data for ranking task") raise ValueError("Should set group for all eval dataset for ranking task")
if eval_at is not None: if eval_at is not None:
other_params = {} if other_params is None else other_params other_params = {} if other_params is None else other_params
other_params['ndcg_eval_at'] = list(eval_at) other_params['ndcg_eval_at'] = list(eval_at)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment