Commit ebfc8521 authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

add an advanced example; add guide-python README.md details; clean error messages (#117)

parent b51c7be4
......@@ -16,3 +16,23 @@ Now you can run examples in this folder, for example:
```
python simple_example.py
```
Examples include:
- [simple_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py)
- Construct Dataset
- Basic train and predict
- Eval during training
- Early stopping
- Save model to file
- Dump model to json format
- Feature importances
- [sklearn_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/sklearn_example.py)
- Basic train and predict with sklearn interface
- Feature importances with sklearn interface
- [advanced_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py)
- Set feature names
- Directly use categorical features without one-hot encoding
- Load model file to continue training
- Change learning rates during training
- Self-defined objective function
- Self-defined eval metric
- Callback function
\ No newline at end of file
# coding: utf-8
# pylint: disable = invalid-name, C0111
import lightgbm as lgb
import pandas as pd
import numpy as np
# load or create your dataset
# binary.train / binary.test are tab-separated with the label in column 0;
# the .weight files hold one per-sample weight per row
print('Load data...')
df_train = pd.read_csv('../binary_classification/binary.train', header=None, sep='\t')
df_test = pd.read_csv('../binary_classification/binary.test', header=None, sep='\t')
W_train = pd.read_csv('../binary_classification/binary.train.weight', header=None)[0]
W_test = pd.read_csv('../binary_classification/binary.test.weight', header=None)[0]
# split label (column 0) from the feature columns
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
num_train, num_feature = X_train.shape
# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train,
weight=W_train, free_raw_data=False)
# reference=lgb_train makes the eval set share the training set's bin mappers
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
weight=W_test, free_raw_data=False)
# specify your configurations as a dict
params = {
'boosting_type' : 'gbdt',
'objective' : 'binary',
'metric' : 'binary_logloss',
'num_leaves' : 31,
'learning_rate' : 0.05,
'feature_fraction' : 0.9,
'bagging_fraction' : 0.8,
'bagging_freq': 5,
'verbose' : 0
}
# generate a feature name for each column (feature_0, feature_1, ...)
feature_name = ['feature_' + str(col) for col in range(num_feature)]
print('Start training...')
# feature_name and categorical_feature
# column index 21 is treated as categorical without one-hot encoding
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train, # eval training data
feature_name=feature_name,
categorical_feature=[21])
# check feature name
print('Finish first 10 rounds...')
print('7th feature name is:', repr(lgb_train.feature_name[6]))
# save model to file
gbm.save_model('model.txt')
# continue training
# init_model accepts:
# 1. model file name
# 2. Booster()
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model='model.txt',
valid_sets=lgb_eval)
print('Finish 10 - 20 rounds with model file...')
# decay learning rates
# learning_rates accepts:
# 1. list/tuple with length = num_boost_round
# 2. function(curr_iter)
# 3. function(curr_iter, total_iter)
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model=gbm,
learning_rates=lambda iter: 0.05 * (0.99 ** iter),
valid_sets=lgb_eval)
print('Finish 20 - 30 rounds with decay learning rates...')
# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelood(preds, train_data):
    """Self-defined objective: log likelihood loss for binary classification.

    f(preds: array, train_data: Dataset) -> grad: array, hess: array
    `preds` are raw scores; they are mapped through the sigmoid first.
    """
    y_true = train_data.get_label()
    prob = 1. / (1. + np.exp(-preds))
    # gradient and hessian of the logistic loss w.r.t. the raw score
    return prob - y_true, prob * (1. - prob)
# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
# binary error
def binary_error(preds, train_data):
    """Self-defined eval metric: binary classification error rate.

    f(preds: array, train_data: Dataset) -> name: string, value: float, is_higher_better: bool
    Predictions above 0.5 are counted as the positive class.
    """
    y_true = train_data.get_label()
    hard_preds = preds > 0.5
    return 'error', np.mean(y_true != hard_preds), False
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model=gbm,
fobj=loglikelood,
feval=binary_error,
valid_sets=lgb_eval)
print('Finish 30 - 40 rounds with self-defined objective function and eval metric...')
print('Start a new training job...')
# callback
def reset_metrics():
    """Build a training callback that registers an extra validation set.

    Returns a callable run before each boosting iteration; at the 5th
    iteration (relative to begin_iteration) it attaches a fresh eval
    Dataset named 'new valid' to the running Booster.
    """
    def callback(env):
        # a new Dataset is constructed on every call, matching the
        # original example's behavior
        fresh_valid = lgb.Dataset(X_test, y_test, reference=lgb_train)
        if env.iteration - env.begin_iteration == 5:
            print('Add a new valid dataset at iteration 5...')
            env.model.add_valid(fresh_valid, 'new valid')
    # run before (not after) each iteration, at default ordering
    callback.before_iteration = True
    callback.order = 0
    return callback
# train a fresh model, exercising the reset_metrics() callback defined above
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train,
callbacks=[reset_metrics()])
print('Finish first 10 rounds with callback function...')
......@@ -6,6 +6,7 @@ import pandas as pd
from sklearn.metrics import mean_squared_error
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
......@@ -18,7 +19,6 @@ X_test = df_test.drop(0, axis=1)
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict
params = {
'task' : 'train',
......@@ -33,27 +33,32 @@ params = {
'verbose' : 0
}
print('Start training...')
# train
gbm = lgb.train(params,
lgb_train,
num_boost_round=100,
num_boost_round=20,
valid_sets=lgb_eval,
early_stopping_rounds=10)
early_stopping_rounds=5)
print('Save model...')
# save model to file
gbm.save_model('model.txt')
print('Start predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print('Dump model to JSON...')
# dump model to json (and save to file)
model_json = gbm.dump_model()
with open('model.json', 'w+') as f:
json.dump(model_json, f, indent=4)
print('Calculate feature importances...')
# feature importances
print('Feature importances:', gbm.feature_importance())
print('Feature importances:', gbm.feature_importance("gain"))
print('Feature importances:', list(gbm.feature_importance()))
# print('Feature importances:', list(gbm.feature_importance("gain")))
......@@ -5,6 +5,7 @@ import pandas as pd
from sklearn.metrics import mean_squared_error
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
......@@ -13,19 +14,23 @@ y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
print('Start training...')
# train
gbm = lgb.LGBMRegressor(objective='regression',
num_leaves=31,
learning_rate=0.05,
n_estimators=100)
n_estimators=20)
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
early_stopping_rounds=10)
eval_metric='l1',
early_stopping_rounds=5)
print('Start predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print('Calculate feature importances...')
# feature importances
print('Feature importances:', gbm.feature_importance())
print('Feature importances:', list(gbm.feature_importance()))
# coding: utf-8
# pylint: disable = invalid-name, C0111, C0301, R0912, R0913, R0914, W0105
# pylint: disable = invalid-name, C0111, C0301
# pylint: disable = R0912, R0913, R0914, W0105, W0201, W0212
# pylint: disable = E1101
"""Wrapper c_api of LightGBM"""
from __future__ import absolute_import
......@@ -17,13 +18,11 @@ from .libpath import find_lib_path
"""pandas"""
try:
from pandas import Series, DataFrame
IS_PANDAS_INSTALLED = True
except ImportError:
class Series(object):
pass
class DataFrame(object):
pass
IS_PANDAS_INSTALLED = False
IS_PY3 = (sys.version_info[0] == 3)
......@@ -72,7 +71,7 @@ def is_1d_list(data):
return isinstance(data, list) and \
(not data or isinstance(data[0], (int, float, bool)))
def list_to_1d_numpy(data, dtype):
def list_to_1d_numpy(data, dtype=np.float32, name='list'):
"""convert to 1d numpy array"""
if is_numpy_1d_array(data):
if data.dtype == dtype:
......@@ -81,28 +80,26 @@ def list_to_1d_numpy(data, dtype):
return data.astype(dtype=dtype, copy=False)
elif is_1d_list(data):
return np.array(data, dtype=dtype, copy=False)
elif IS_PANDAS_INSTALLED and isinstance(data, Series):
return data.astype(dtype).values
elif isinstance(data, Series):
return data.values.astype(dtype)
else:
raise TypeError("Unknow type({})".format(type(data).__name__))
raise TypeError("Wrong type({}) for {}, should be list or numpy array".format(type(data).__name__, name))
def cfloat32_array_to_numpy(cptr, length):
"""Convert a ctypes float pointer array to a numpy array.
"""
if isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
res = np.fromiter(cptr, dtype=np.float32, count=length)
return res
return np.fromiter(cptr, dtype=np.float32, count=length)
else:
raise RuntimeError('expected float pointer')
raise RuntimeError('Expected float pointer')
def cint32_array_to_numpy(cptr, length):
"""Convert a ctypes float pointer array to a numpy array.
"""
if isinstance(cptr, ctypes.POINTER(ctypes.c_int32)):
res = np.fromiter(cptr, dtype=np.int32, count=length)
return res
return np.fromiter(cptr, dtype=np.int32, count=length)
else:
raise RuntimeError('expected int pointer')
raise RuntimeError('Expected int pointer')
def c_str(string):
"""Convert a python string to cstring."""
......@@ -113,7 +110,7 @@ def c_array(ctype, values):
return (ctype * len(values))(*values)
def param_dict_to_str(data):
if not data:
if data is None or not data:
return ""
pairs = []
for key, val in data.items():
......@@ -122,7 +119,7 @@ def param_dict_to_str(data):
elif isinstance(val, (list, tuple, set)):
pairs.append(str(key)+'='+','.join(map(str, val)))
else:
raise TypeError('unknow type of parameter:%s , got:%s'
raise TypeError('Unknown type of parameter:%s, got:%s'
% (key, type(val).__name__))
return ' '.join(pairs)
......@@ -158,10 +155,10 @@ def c_float_array(data):
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
type_data = C_API_DTYPE_FLOAT64
else:
raise TypeError("expected np.float32 or np.float64, met type({})"
raise TypeError("Expected np.float32 or np.float64, met type({})"
.format(data.dtype))
else:
raise TypeError("Unknow type({})".format(type(data).__name__))
raise TypeError("Unknown type({})".format(type(data).__name__))
return (ptr_data, type_data)
def c_int_array(data):
......@@ -176,10 +173,10 @@ def c_int_array(data):
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
type_data = C_API_DTYPE_INT64
else:
raise TypeError("expected np.int32 or np.int64, met type({})"
raise TypeError("Expected np.int32 or np.int64, met type({})"
.format(data.dtype))
else:
raise TypeError("Unknow type({})".format(type(data).__name__))
raise TypeError("Unknown type({})".format(type(data).__name__))
return (ptr_data, type_data)
class _InnerPredictor(object):
......@@ -261,7 +258,7 @@ class _InnerPredictor(object):
Prediction result
"""
if isinstance(data, (_InnerDataset, Dataset)):
raise TypeError("cannot use Dataset instance for prediction, please use raw data instead")
raise TypeError("Cannot use Dataset instance for prediction, please use raw data instead")
predict_type = C_API_PREDICT_NORMAL
if raw_score:
predict_type = C_API_PREDICT_RAW_SCORE
......@@ -290,7 +287,7 @@ class _InnerPredictor(object):
elif isinstance(data, np.ndarray):
preds, nrow = self.__pred_for_np2d(data, num_iteration,
predict_type)
elif IS_PANDAS_INSTALLED and isinstance(data, DataFrame):
elif isinstance(data, DataFrame):
preds, nrow = self.__pred_for_np2d(data.values, num_iteration,
predict_type)
else:
......@@ -299,15 +296,14 @@ class _InnerPredictor(object):
preds, nrow = self.__pred_for_csr(csr, num_iteration,
predict_type)
except:
raise TypeError('can not predict data for type {}'.
format(type(data).__name__))
raise TypeError('Cannot predict data for type {}'.format(type(data).__name__))
if pred_leaf:
preds = preds.astype(np.int32)
if is_reshape and preds.size != nrow:
if preds.size % nrow == 0:
preds = preds.reshape(nrow, -1)
else:
raise ValueError('length of predict result (%d) cannot be divide nrow (%d)'
raise ValueError('Length of predict result (%d) cannot be divide nrow (%d)'
% (preds.size, nrow))
return preds
......@@ -353,7 +349,7 @@ class _InnerPredictor(object):
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
))
if n_preds != out_num_preds.value:
raise ValueError("incorrect number for predict result")
raise ValueError("Wrong length for predict results")
return preds, mat.shape[0]
def __pred_for_csr(self, csr, num_iteration, predict_type):
......@@ -384,7 +380,7 @@ class _InnerPredictor(object):
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
))
if n_preds != out_num_preds.value:
raise ValueError("incorrect number for predict result")
raise ValueError("Wrong length for predict results")
return preds, nrow
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
......@@ -481,10 +477,10 @@ class _InnerDataset(object):
elif isinstance(name, int):
categorical_indices.add(name)
else:
raise TypeError("unknown type({}) or unknown name({}) in categorical_feature" \
raise TypeError("Wrong type({}) or unknown name({}) in categorical_feature" \
.format(type(name).__name__, name))
params['categorical_column'] = categorical_indices
params['categorical_column'] = sorted(categorical_indices)
params_str = param_dict_to_str(params)
"""process for reference dataset"""
......@@ -514,11 +510,11 @@ class _InnerDataset(object):
csr = scipy.sparse.csr_matrix(data)
self.__init_from_csr(csr, params_str, ref_dataset)
except:
raise TypeError('can not initialize _InnerDataset from {}'.format(type(data).__name__))
raise TypeError('Cannot initialize _InnerDataset from {}'.format(type(data).__name__))
if label is not None:
self.set_label(label)
if self.get_label() is None:
raise ValueError("label should not be None")
raise ValueError("Label should not be None")
if weight is not None:
self.set_weight(weight)
if group is not None:
......@@ -572,7 +568,7 @@ class _InnerDataset(object):
"""
Get subset of current dataset
"""
used_indices = list_to_1d_numpy(used_indices, np.int32)
used_indices = list_to_1d_numpy(used_indices, np.int32, name='used_indices')
ret = _InnerDataset(None)
ret.handle = ctypes.c_void_p()
params_str = param_dict_to_str(params)
......@@ -585,7 +581,7 @@ class _InnerDataset(object):
ret.max_bin = self.max_bin
ret.predictor = self.predictor
if ret.get_label() is None:
raise ValueError("label should not be None")
raise ValueError("Label should not be None")
return ret
def set_feature_name(self, feature_name):
......@@ -595,7 +591,7 @@ class _InnerDataset(object):
if feature_name is None:
return
if len(feature_name) != self.num_feature():
raise ValueError("size of feature_name error")
raise ValueError("Length of feature_name({}) and num_feature({}) don't match".format(len(feature_name), self.num_feature()))
c_feature_name = [c_str(name) for name in feature_name]
_safe_call(_LIB.LGBM_DatasetSetFeatureNames(
self.handle,
......@@ -632,7 +628,7 @@ class _InnerDataset(object):
Initialize data from a CSR matrix.
"""
if len(csr.indices) != len(csr.data):
raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
raise ValueError('Length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
self.handle = ctypes.c_void_p()
ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
......@@ -685,7 +681,7 @@ class _InnerDataset(object):
elif out_type.value == C_API_DTYPE_FLOAT32:
return cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value)
else:
raise TypeError("unknow type")
raise TypeError("Unknown type")
def set_field(self, field_name, data):
"""Set property into the _InnerDataset.
......@@ -707,11 +703,8 @@ class _InnerDataset(object):
0,
FIELD_TYPE_MAPPER[field_name]))
return
if IS_PANDAS_INSTALLED and isinstance(data, Series):
dtype = np.int32 if field_name == 'group' else np.float32
data = data.astype(dtype).values
if not is_numpy_1d_array(data):
raise TypeError("Unknow type({})".format(type(data).__name__))
data = list_to_1d_numpy(data, dtype, name=field_name)
if data.dtype == np.float32:
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
type_data = C_API_DTYPE_FLOAT32
......@@ -719,9 +712,9 @@ class _InnerDataset(object):
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
type_data = C_API_DTYPE_INT32
else:
raise TypeError("expected np.float32 or np.int32, met type({})".format(data.dtype))
raise TypeError("Expected np.float32 or np.int32, met type({})".format(data.dtype))
if type_data != FIELD_TYPE_MAPPER[field_name]:
raise TypeError("type error for set_field")
raise TypeError("Input type error for set_field")
_safe_call(_LIB.LGBM_DatasetSetField(
self.handle,
c_str(field_name),
......@@ -749,7 +742,7 @@ class _InnerDataset(object):
label: numpy array or list or None
The label information to be set into _InnerDataset
"""
label = list_to_1d_numpy(label, np.float32)
label = list_to_1d_numpy(label, name='label')
self.set_field('label', label)
def set_weight(self, weight):
......@@ -761,7 +754,7 @@ class _InnerDataset(object):
Weight for each data point
"""
if weight is not None:
weight = list_to_1d_numpy(weight, np.float32)
weight = list_to_1d_numpy(weight, name='weight')
self.set_field('weight', weight)
def set_init_score(self, score):
......@@ -773,7 +766,7 @@ class _InnerDataset(object):
Init score for booster
"""
if score is not None:
score = list_to_1d_numpy(score, np.float32)
score = list_to_1d_numpy(score, name='init score')
self.set_field('init_score', score)
def set_group(self, group):
......@@ -785,7 +778,7 @@ class _InnerDataset(object):
Group size of each group
"""
if group is not None:
group = list_to_1d_numpy(group, np.int32)
group = list_to_1d_numpy(group, np.int32, name='group')
self.set_field('group', group)
def get_label(self):
......@@ -941,7 +934,8 @@ class Dataset(object):
else:
self.inner_dataset = _InnerDataset(self.data, self.label, self.max_bin,
None, self.weight, self.group, self._predictor,
self.silent, self.feature_name, self.categorical_feature, self.params)
self.silent, self.feature_name,
self.categorical_feature, self.params)
if self.free_raw_data:
self.data = None
......@@ -994,7 +988,7 @@ class Dataset(object):
Parameters
----------
reference : Dataset
will use reference as template to construct current dataset
Will use reference as template to construct current dataset
"""
self.set_categorical_feature(reference.categorical_feature)
self.set_feature_name(reference.feature_name)
......@@ -1015,7 +1009,7 @@ class Dataset(object):
Parameters
----------
feature_name : list of str
feature names
Feature names
"""
self.feature_name = feature_name
if self.__is_constructed():
......@@ -1028,9 +1022,9 @@ class Dataset(object):
Parameters
----------
used_indices : list of int
use indices of this subset
Used indices of this subset
params : dict
other parameters
Other parameters
"""
ret = Dataset(None)
ret.feature_name = self.feature_name
......@@ -1198,7 +1192,7 @@ class Booster(object):
if train_set is not None:
"""Training task"""
if not isinstance(train_set, Dataset):
raise TypeError('training data should be Dataset instance, met {}'.format(type(train_set).__name__))
raise TypeError('Training data should be Dataset instance, met {}'.format(type(train_set).__name__))
params_str = param_dict_to_str(params)
"""construct booster object"""
_safe_call(_LIB.LGBM_BoosterCreate(
......@@ -1237,7 +1231,7 @@ class Booster(object):
ctypes.byref(out_num_class)))
self.__num_class = out_num_class.value
else:
raise TypeError('At least need training dataset or model file to create booster instance')
raise TypeError('Need at least one training dataset or model file to create booster instance')
def __del__(self):
if self.handle is not None:
......@@ -1342,22 +1336,10 @@ class Booster(object):
-------
is_finished, bool
"""
if not is_numpy_1d_array(grad):
if is_1d_list(grad):
grad = np.array(grad, dtype=np.float32, copy=False)
else:
raise TypeError("grad should be numpy 1d array or 1d list")
if not is_numpy_1d_array(hess):
if is_1d_list(hess):
hess = np.array(hess, dtype=np.float32, copy=False)
else:
raise TypeError("hess should be numpy 1d array or 1d list")
grad = list_to_1d_numpy(grad, name='gradient')
hess = list_to_1d_numpy(hess, name='hessian')
if len(grad) != len(hess):
raise ValueError('grad / hess lengths mismatch: {} / {}'.format(len(grad), len(hess)))
if grad.dtype != np.float32:
grad = grad.astype(np.float32, copy=False)
if hess.dtype != np.float32:
hess = hess.astype(np.float32, copy=False)
raise ValueError("Lengths of gradient({}) and hessian({}) don't match".format(len(grad), len(hess)))
is_finished = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom(
self.handle,
......@@ -1548,7 +1530,7 @@ class Booster(object):
Evaulate training or validation data
"""
if data_idx >= self.__num_dataset:
raise ValueError("data_idx should be smaller than number of dataset")
raise ValueError("Data_idx should be smaller than number of dataset")
self.__get_eval_info()
ret = []
if self.__num_inner_eval > 0:
......@@ -1560,7 +1542,7 @@ class Booster(object):
ctypes.byref(tmp_out_len),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
if tmp_out_len.value != self.__num_inner_eval:
raise ValueError("incorrect number of eval results")
raise ValueError("Wrong length of eval results")
for i in range(self.__num_inner_eval):
ret.append((data_name, self.__name_inner_eval[i], result[i], self.__higher_better_inner_eval[i]))
if feval is not None:
......@@ -1582,7 +1564,7 @@ class Booster(object):
Predict for training and validation dataset
"""
if data_idx >= self.__num_dataset:
raise ValueError("data_idx should be smaller than number of dataset")
raise ValueError("Data_idx should be smaller than number of dataset")
if self.__inner_predict_buffer[data_idx] is None:
if data_idx == 0:
n_preds = self.train_set.num_data() * self.__num_class
......@@ -1600,7 +1582,7 @@ class Booster(object):
ctypes.byref(tmp_out_len),
data_ptr))
if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]):
raise ValueError("incorrect number of predict results for data %d" % (data_idx))
raise ValueError("Wrong length of predict results for data %d" % (data_idx))
self.__is_predicted_cur_iter[data_idx] = True
return self.__inner_predict_buffer[data_idx]
......@@ -1626,7 +1608,7 @@ class Booster(object):
ctypes.byref(tmp_out_len),
ptr_string_buffers))
if self.__num_inner_eval != tmp_out_len.value:
raise ValueError("size of eval names doesn't equal with num_evals")
raise ValueError("Length of eval names doesn't equal with num_evals")
self.__name_inner_eval = \
[string_buffers[i].value.decode() for i in range(self.__num_inner_eval)]
self.__higher_better_inner_eval = \
......@@ -1658,7 +1640,7 @@ class Booster(object):
for key, value in kwargs.items():
if value is not None:
if not is_str(value):
raise ValueError("set_attr only accepts string values")
raise ValueError("Set attr only accepts strings")
self.__attr[key] = value
else:
self.__attr.pop(key, None)
......@@ -35,7 +35,7 @@ def _format_eval_result(value, show_stdv=True):
else:
return '%s\'s %s:%g' % (value[0], value[1], value[2])
else:
raise ValueError("wrong metric value")
raise ValueError("Wrong metric value")
def print_evaluation(period=1, show_stdv=True):
......@@ -80,7 +80,7 @@ def record_evaluation(eval_result):
The requested callback function.
"""
if not isinstance(eval_result, dict):
raise TypeError('eval_result has to be a dictionary')
raise TypeError('Eval_result should be a dictionary')
eval_result.clear()
def init(env):
......@@ -164,7 +164,7 @@ def early_stop(stopping_rounds, verbose=True):
def init(env):
"""internal function"""
if not env.evaluation_result_list:
raise ValueError('For early stopping you need at least one set in evals.')
raise ValueError('For early stopping, at least one dataset is required for evaluation')
if verbose:
msg = "Train until valid scores didn't improve in {} rounds."
......@@ -194,7 +194,7 @@ def early_stop(stopping_rounds, verbose=True):
if env.model is not None:
env.model.set_attr(best_iteration=str(best_iter[i]))
if verbose:
print('early stopping, best iteration is:')
print('Early stopping, best iteration is:')
print(best_msg[i])
raise EarlyStopException(best_iter[i])
callback.order = 30
......
......@@ -85,10 +85,10 @@ def train(params, train_set, num_boost_round=100,
predictor = init_model._to_predictor()
else:
predictor = None
init_iteration = predictor.num_total_iteration if predictor else 0
init_iteration = predictor.num_total_iteration if predictor is not None else 0
"""check dataset"""
if not isinstance(train_set, Dataset):
raise TypeError("only can accept Dataset instance for training")
raise TypeError("Training only accepts Dataset object")
train_set._set_predictor(predictor)
train_set.set_feature_name(feature_name)
......@@ -98,7 +98,7 @@ def train(params, train_set, num_boost_round=100,
train_data_name = "training"
reduced_valid_sets = []
name_valid_sets = []
if valid_sets:
if valid_sets is not None:
if isinstance(valid_sets, Dataset):
valid_sets = [valid_sets]
if isinstance(valid_names, str):
......@@ -111,7 +111,7 @@ def train(params, train_set, num_boost_round=100,
train_data_name = valid_names[i]
continue
if not isinstance(valid_data, Dataset):
raise TypeError("only can accept Dataset instance for training")
raise TypeError("Training only accepts Dataset object")
valid_data.set_reference(train_set)
reduced_valid_sets.append(valid_data)
if valid_names is not None and len(valid_names) > i:
......@@ -120,7 +120,7 @@ def train(params, train_set, num_boost_round=100,
name_valid_sets.append('valid_'+str(i))
"""process callbacks"""
if not callbacks:
if callbacks is None:
callbacks = set()
else:
for i, cb in enumerate(callbacks):
......@@ -133,7 +133,7 @@ def train(params, train_set, num_boost_round=100,
elif isinstance(verbose_eval, int):
callbacks.add(callback.print_evaluation(verbose_eval))
if early_stopping_rounds:
if early_stopping_rounds is not None:
callbacks.add(callback.early_stop(early_stopping_rounds,
verbose=bool(verbose_eval)))
......@@ -169,7 +169,7 @@ def train(params, train_set, num_boost_round=100,
evaluation_result_list = []
# check evaluation result.
if valid_sets:
if valid_sets is not None:
if is_valid_contain_train:
evaluation_result_list.extend(booster.eval_train(feval))
evaluation_result_list.extend(booster.eval_valid(feval))
......@@ -227,7 +227,7 @@ def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=Fals
sfk = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
idset = [x[1] for x in sfk.split(X=full_data.get_label(), y=full_data.get_label())]
else:
raise LightGBMError('sklearn needs to be installed in order to use stratified cv')
raise LightGBMError('Scikit-learn is required for stratified cv')
else:
full_data.construct()
randidx = np.random.permutation(full_data.num_data())
......@@ -318,7 +318,7 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
evaluation history : list(string)
"""
if not isinstance(train_set, Dataset):
raise TypeError("only can accept Dataset instance for training")
raise TypeError("Training only accepts Dataset object")
if is_str(init_model):
predictor = _InnerPredictor(model_file=init_model)
......@@ -342,13 +342,13 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
cvfolds = _make_n_folds(train_set, nfold, params, seed, fpreproc, stratified)
# setup callbacks
if not callbacks:
if callbacks is None:
callbacks = set()
else:
for i, cb in enumerate(callbacks):
cb.__dict__.setdefault('order', i - len(callbacks))
callbacks = set(callbacks)
if early_stopping_rounds:
if early_stopping_rounds is not None:
callbacks.add(callback.early_stop(early_stopping_rounds, verbose=False))
if verbose_eval is True:
callbacks.add(callback.print_evaluation(show_stdv=show_stdv))
......
......@@ -6,7 +6,7 @@ from __future__ import absolute_import
import numpy as np
from .basic import LightGBMError, Dataset, is_str
from .engine import train
# sklearn
'''sklearn'''
try:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin, ClassifierMixin
......@@ -38,7 +38,6 @@ def _point_wise_objective(func):
y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
The predicted values
Returns
-------
new_func: callable
......@@ -66,7 +65,7 @@ def _point_wise_objective(func):
num_data = len(weight)
num_class = len(grad) // num_data
if num_class * num_data != len(grad):
raise ValueError("length of grad and hess should equal to num_class * num_data")
raise ValueError("Length of grad and hess should equal to num_class * num_data")
for k in range(num_class):
for i in range(num_data):
idx = k * num_data + i
......@@ -147,7 +146,7 @@ class LGBMModel(LGBMModelBase):
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
if not SKLEARN_INSTALLED:
raise LightGBMError('sklearn needs to be installed in order to use this module')
raise LightGBMError('Scikit-learn is required for this module')
self.num_leaves = num_leaves
self.max_depth = max_depth
......@@ -185,7 +184,7 @@ class LGBMModel(LGBMModelBase):
booster : a lightgbm booster of underlying model
"""
if self._Booster is None:
raise LightGBMError('need to call fit beforehand')
raise LightGBMError('Need to call fit beforehand')
return self._Booster
def get_params(self, deep=False):
......@@ -343,7 +342,7 @@ class LGBMModel(LGBMModelBase):
if self.evals_result_:
evals_result = self.evals_result_
else:
raise LightGBMError('No results.')
raise LightGBMError('No results found.')
return evals_result
......@@ -390,8 +389,8 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
is_unbalance=False, seed=0):
super(LGBMClassifier, self).__init__(num_leaves, max_depth,
learning_rate, n_estimators, max_bin,
silent, objective,
nthread, min_split_gain, min_child_weight, min_child_samples,
silent, objective, nthread,
min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed)
......@@ -480,7 +479,7 @@ def _group_wise_objective(func):
labels = dataset.get_label()
group = dataset.get_group()
if group is None:
raise ValueError("group should not be None for ranking task")
raise ValueError("Group should not be None for ranking task")
grad, hess = func(labels, group, preds)
"""weighted for objective"""
weight = dataset.get_weight()
......@@ -490,7 +489,7 @@ def _group_wise_objective(func):
grad = np.multiply(grad, weight)
hess = np.multiply(hess, weight)
else:
raise ValueError("lenght of grad and hess should equal with num_data")
raise ValueError("Length of grad and hess should equal with num_data")
return grad, hess
return inner
......@@ -508,8 +507,8 @@ class LGBMRanker(LGBMModel):
is_unbalance=False, seed=0):
super(LGBMRanker, self).__init__(num_leaves, max_depth,
learning_rate, n_estimators, max_bin,
silent, objective,
nthread, min_split_gain, min_child_weight, min_child_samples,
silent, objective, nthread,
min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed)
......@@ -535,17 +534,18 @@ class LGBMRanker(LGBMModel):
"""check group data"""
if group is None:
raise ValueError("should use group for ranking task")
raise ValueError("Should set group for ranking task")
if eval_set is not None:
if eval_group is None:
raise ValueError("eval_group cannot be None when eval_set is not None")
raise ValueError("Eval_group cannot be None when eval_set is not None")
elif len(eval_group) != len(eval_set):
raise ValueError("length of eval_group should equal with eval_set")
raise ValueError("Length of eval_group should equal to eval_set")
else:
for inner_group in eval_group:
if inner_group is None:
raise ValueError("should set group for all eval data for ranking task")
raise ValueError("Should set group for all eval dataset for ranking task")
if eval_at is not None:
other_params = {} if other_params is None else other_params
other_params['ndcg_eval_at'] = list(eval_at)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment