Commit 6c248d37 authored by wxchan, committed by Guolin Ke

support pandas categorical (#193)

* support pandas categorical

* refine logic

* make default=auto

* fix train/valid categorical codes

* add test

* unify set _predictor

* fix tests

* fix warning

* support feature_name=int
parent 00e5b244
......@@ -14,7 +14,7 @@ before_install:
install:
- sudo apt-get install -y libopenmpi-dev openmpi-bin build-essential
- conda install --yes atlas numpy scipy scikit-learn
- conda install --yes atlas numpy scipy scikit-learn pandas
- pip install pep8
......
......@@ -5,8 +5,8 @@
- [Booster](Python-API.md#booster)
* [Training API](Python-API.md#training-api)
- [train](Python-API.md#trainparams-train_set-num_boost_round100-valid_setsnone-valid_namesnone-fobjnone-fevalnone-init_modelnone-feature_namenone-categorical_featurenone-early_stopping_roundsnone-evals_resultnone-verbose_evaltrue-learning_ratesnone-callbacksnone)
- [cv](Python-API.md#cvparams-train_set-num_boost_round10-nfold5-stratifiedfalse-shuffletrue-metricsnone-fobjnone-fevalnone-init_modelnone-feature_namenone-categorical_featurenone-early_stopping_roundsnone-fpreprocnone-verbose_evalnone-show_stdvtrue-seed0-callbacksnone)
- [train](Python-API.md#trainparams-train_set-num_boost_round100-valid_setsnone-valid_namesnone-fobjnone-fevalnone-init_modelnone-feature_nameauto-categorical_featureauto-early_stopping_roundsnone-evals_resultnone-verbose_evaltrue-learning_ratesnone-callbacksnone)
- [cv](Python-API.md#cvparams-train_set-num_boost_round10-nfold5-stratifiedfalse-shuffletrue-metricsnone-fobjnone-fevalnone-init_modelnone-feature_nameauto-categorical_featureauto-early_stopping_roundsnone-fpreprocnone-verbose_evalnone-show_stdvtrue-seed0-callbacksnone)
* [Scikit-learn API](Python-API.md#scikit-learn-api)
- [Common Methods](Python-API.md#common-methods)
......@@ -31,7 +31,7 @@ The methods of each Class is in alphabetical order.
###Dataset
####__init__(data, label=None, max_bin=255, reference=None, weight=None, group=None, silent=False, feature_name=None, categorical_feature=None, params=None, free_raw_data=True)
####__init__(data, label=None, max_bin=255, reference=None, weight=None, group=None, silent=False, feature_name='auto', categorical_feature='auto', params=None, free_raw_data=True)
Parameters
----------
......@@ -50,12 +50,14 @@ The methods of each Class is in alphabetical order.
Group/query size for dataset
silent : boolean, optional
Whether to print messages during construction
feature_name : list of str
feature_name : list of str, or 'auto'
Feature names
categorical_feature : list of str or list of int
If 'auto' and data is pandas DataFrame, use data column names
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
params : dict, optional
Other parameters
free_raw_data : Bool
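As a quick illustration of the new defaults, the following sketch (the column names and data are illustrative placeholders, not from the commit) builds a Dataset straight from a DataFrame; `feature_name='auto'` picks up `df.columns` and `categorical_feature='auto'` picks up the `category`-typed columns:

```python
import numpy as np
import pandas as pd
import lightgbm as lgb

rng = np.random.RandomState(42)
df = pd.DataFrame({'color': pd.Categorical(rng.choice(['red', 'green', 'blue'], size=200)),
                   'size': rng.rand(200)})
y = rng.randint(0, 2, size=200)

# 'color' is used as a categorical feature because of its pandas dtype;
# feature names are taken from df.columns. No manual label encoding needed.
ds = lgb.Dataset(df, label=y)  # feature_name='auto', categorical_feature='auto'
```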
......@@ -445,7 +447,7 @@ The methods of each Class is in alphabetical order.
##Training API
####train(params, train_set, num_boost_round=100, valid_sets=None, valid_names=None, fobj=None, feval=None, init_model=None, feature_name=None, categorical_feature=None, early_stopping_rounds=None, evals_result=None, verbose_eval=True, learning_rates=None, callbacks=None)
####train(params, train_set, num_boost_round=100, valid_sets=None, valid_names=None, fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=None, evals_result=None, verbose_eval=True, learning_rates=None, callbacks=None)
Train with given parameters.
......@@ -468,12 +470,14 @@ The methods of each Class is in alphabetical order.
Note: should return (eval_name, eval_result, is_higher_better) of list of this
init_model : file name of lightgbm model or 'Booster' instance
model used for continued training
feature_name : list of str
feature_name : list of str, or 'auto'
Feature names
categorical_feature : list of str or list of int
If 'auto' and data is pandas DataFrame, use data column names
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
Activates early stopping.
Requires at least one validation data and one metric
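A hedged sketch of the two ways to declare categorical features with `train` (reusing the placeholder `df` and `y` from the Dataset sketch above):

```python
import lightgbm as lgb

params = {'objective': 'binary', 'verbose': -1}

# 1) Rely on the new defaults: names and categorical columns come from the DataFrame.
bst = lgb.train(params, lgb.Dataset(df, label=y), num_boost_round=10)

# 2) Name them explicitly on a plain array; str entries in categorical_feature
#    require feature_name to be given as well.
X = df.assign(color=df['color'].cat.codes).values
bst = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=10,
                feature_name=['color', 'size'], categorical_feature=['color'])
```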
......@@ -513,7 +517,7 @@ The methods of each Class is in alphabetical order.
booster : a trained booster model
####cv(params, train_set, num_boost_round=10, nfold=5, stratified=False, shuffle=True, metrics=None, fobj=None, feval=None, init_model=None, feature_name=None, categorical_feature=None, early_stopping_rounds=None, fpreproc=None, verbose_eval=None, show_stdv=True, seed=0, callbacks=None)
####cv(params, train_set, num_boost_round=10, nfold=5, stratified=False, shuffle=True, metrics=None, fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=None, fpreproc=None, verbose_eval=None, show_stdv=True, seed=0, callbacks=None)
Cross-validation with given parameters.
......@@ -541,11 +545,14 @@ The methods of each Class is in alphabetical order.
Custom evaluation function.
init_model : file name of lightgbm model or 'Booster' instance
model used for continued training
feature_name : list of str
feature_name : list of str, or 'auto'
Feature names
categorical_feature : list of str or int
Categorical features, type int represents index,
If 'auto' and data is pandas DataFrame, use data column names
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue.
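The same defaults apply to `cv`; a minimal sketch (same placeholder `df` and `y` as above):

```python
import lightgbm as lgb

# categorical_feature='auto' by default, so the pandas category column is
# handled without extra arguments; cv returns a dict of per-round metrics.
cv_results = lgb.cv({'objective': 'binary', 'verbose': -1},
                    lgb.Dataset(df, label=y),
                    num_boost_round=10, nfold=5, stratified=True, seed=0)
```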
......@@ -693,7 +700,7 @@ The methods of each Class is in alphabetical order.
X_leaves : array_like, shape=[n_samples, n_trees]
####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name=None, categorical_feature=None, callbacks=None)
####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name='auto', categorical_feature='auto', callbacks=None)
Fit the gradient boosting model.
......@@ -724,12 +731,14 @@ The methods of each Class is in alphabetical order.
early_stopping_rounds : int
verbose : bool
If `verbose` and an evaluation set is used, writes the evaluation
feature_name : list of str
feature_name : list of str, or 'auto'
Feature names
categorical_feature : list of str or int
If 'auto' and data is pandas DataFrame, use data column names
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well).
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
callbacks : list of callback functions
List of callback functions that are applied at each iteration.
See Callbacks in Python-API.md for more information.
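The scikit-learn wrapper gets the same behavior through `fit`; a minimal sketch with the same placeholder `df` and `y`:

```python
import lightgbm as lgb

clf = lgb.LGBMClassifier(n_estimators=10)
# With feature_name='auto' and categorical_feature='auto', the category-typed
# column in df is treated as categorical without manual encoding.
clf.fit(df, y)
```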
......@@ -823,7 +832,7 @@ The methods of each Class is in alphabetical order.
###LGBMRanker
####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric='ndcg', eval_at=1, early_stopping_rounds=None, verbose=True, feature_name=None, categorical_feature=None, callbacks=None)
####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric='ndcg', eval_at=1, early_stopping_rounds=None, verbose=True, feature_name='auto', categorical_feature='auto', callbacks=None)
Most arguments are the same as in Common Methods except:
......
......@@ -454,8 +454,29 @@ PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'float32': 'float', 'float64': 'float', 'bool': 'int'}
def _data_from_pandas(data):
def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical):
if isinstance(data, DataFrame):
cat_cols = data.select_dtypes(include=['category']).columns
if not pandas_categorical: # train dataset
pandas_categorical = [data[col].cat.categories for col in cat_cols]  # a list, not a generator: a generator is always truthy and cannot be len()'d on the valid pass
else:
if len(cat_cols) != len(pandas_categorical):
raise ValueError('train and valid dataset categorical_feature do not match.')
for col, category in zip(cat_cols, pandas_categorical):
if not data[col].cat.categories.equals(category):  # Index.equals avoids the ambiguous element-wise != comparison
data[col] = data[col].cat.set_categories(category)
if len(cat_cols): # cat_cols is pandas Index object
data = data.copy() # not alter origin DataFrame
data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes)
if categorical_feature is not None:
if feature_name is None:
feature_name = data.columns
if categorical_feature == 'auto':
categorical_feature = cat_cols
else:
categorical_feature += cat_cols
if feature_name == 'auto':
feature_name = data.columns
data_dtypes = data.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
bad_fields = [data.columns[i] for i, dtype in
......@@ -464,7 +485,12 @@ def _data_from_pandas(data):
msg = """DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields """
raise ValueError(msg + ', '.join(bad_fields))
data = data.values.astype('float')
return data
else:
if feature_name == 'auto':
feature_name = None
if categorical_feature == 'auto':
categorical_feature = None
return data, feature_name, categorical_feature, pandas_categorical
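The train/valid alignment above can be illustrated with pandas alone (a standalone sketch, not part of the commit): the valid column is re-indexed onto the train categories so that equal labels map to equal integer codes, and labels unseen in training become -1.

```python
import pandas as pd

train_col = pd.Series(['a', 'b', 'c'], dtype='category')  # categories: ['a', 'b', 'c']
valid_col = pd.Series(['c', 'a', 'd'], dtype='category')  # categories: ['a', 'c', 'd']

# Force the train category list onto the valid column, as _data_from_pandas does.
aligned = valid_col.cat.set_categories(train_col.cat.categories)

print(list(train_col.cat.codes))  # [0, 1, 2]
print(list(aligned.cat.codes))    # [2, 0, -1]  ('d' never appeared in train)
```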
def _label_from_pandas(label):
......@@ -482,7 +508,7 @@ class Dataset(object):
"""Dataset in LightGBM."""
def __init__(self, data, label=None, max_bin=255, reference=None,
weight=None, group=None, silent=False,
feature_name=None, categorical_feature=None, params=None,
feature_name='auto', categorical_feature='auto', params=None,
free_raw_data=True):
"""
Parameters
......@@ -502,12 +528,14 @@ class Dataset(object):
Group/query size for dataset
silent : boolean, optional
Whether to print messages during construction
feature_name : list of str
feature_name : list of str, or 'auto'
Feature names
categorical_feature : list of str or int
If 'auto' and data is pandas DataFrame, use data column names
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
params: dict, optional
Other parameters
free_raw_data: Bool
......@@ -527,6 +555,7 @@ class Dataset(object):
self.free_raw_data = free_raw_data
self.used_indices = None
self._predictor = None
self.pandas_categorical = None
def __del__(self):
self._free_handle()
......@@ -538,12 +567,12 @@ class Dataset(object):
def _lazy_init(self, data, label=None, max_bin=255, reference=None,
weight=None, group=None, predictor=None,
silent=False, feature_name=None,
categorical_feature=None, params=None):
silent=False, feature_name='auto',
categorical_feature='auto', params=None):
if data is None:
self.handle = None
return
data = _data_from_pandas(data)
data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data, feature_name, categorical_feature, self.pandas_categorical)
label = _label_from_pandas(label)
self.data_has_header = False
"""process for args"""
......@@ -760,7 +789,8 @@ class Dataset(object):
ret = Dataset(data, label=label, max_bin=self.max_bin, reference=self,
weight=weight, group=group, silent=silent, params=params,
free_raw_data=self.free_raw_data)
ret._set_predictor(self._predictor)
ret._predictor = self._predictor
ret.pandas_categorical = self.pandas_categorical
return ret
def subset(self, used_indices, params=None):
......@@ -777,6 +807,7 @@ class Dataset(object):
ret = Dataset(None, reference=self, feature_name=self.feature_name,
categorical_feature=self.categorical_feature, params=params)
ret._predictor = self._predictor
ret.pandas_categorical = self.pandas_categorical
ret.used_indices = used_indices
return ret
......@@ -948,7 +979,7 @@ class Dataset(object):
if self.handle is not None and feature_name is not None:
if len(feature_name) != self.num_feature():
raise ValueError("Length of feature_name({}) and num_feature({}) don't match".format(len(feature_name), self.num_feature()))
c_feature_name = [c_str(name) for name in feature_name]
c_feature_name = [c_str(str(name)) for name in feature_name]
_safe_call(_LIB.LGBM_DatasetSetFeatureNames(
self.handle,
c_array(ctypes.c_char_p, c_feature_name),
......
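The `c_str(str(name))` change above is what makes `feature_name=int` work: names are stringified before crossing into the C API. A minimal sketch (the data and names are placeholders):

```python
import numpy as np
import lightgbm as lgb

X = np.random.rand(100, 3)
y = np.random.randint(0, 2, size=100)

# Integer feature names are now accepted; each is converted with str()
# before being handed to LGBM_DatasetSetFeatureNames.
ds = lgb.Dataset(X, label=y, feature_name=[0, 1, 2])
bst = lgb.train({'objective': 'binary', 'verbose': -1}, ds, num_boost_round=5)
```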
......@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types,
def train(params, train_set, num_boost_round=100,
valid_sets=None, valid_names=None,
fobj=None, feval=None, init_model=None,
feature_name=None, categorical_feature=None,
feature_name='auto', categorical_feature='auto',
early_stopping_rounds=None, evals_result=None,
verbose_eval=True, learning_rates=None, callbacks=None):
"""
......@@ -42,12 +42,14 @@ def train(params, train_set, num_boost_round=100,
Note: should return (eval_name, eval_result, is_higher_better) of list of this
init_model : file name of lightgbm model or 'Booster' instance
model used for continued training
feature_name : list of str
feature_name : list of str, or 'auto'
Feature names
categorical_feature : list of str or int
If 'auto' and data is pandas DataFrame, use data column names
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
Activates early stopping.
Requires at least one validation data and one metric
......@@ -267,7 +269,7 @@ def _agg_cv_result(raw_results):
def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
shuffle=True, metrics=None, fobj=None, feval=None, init_model=None,
feature_name=None, categorical_feature=None,
feature_name='auto', categorical_feature='auto',
early_stopping_rounds=None, fpreproc=None,
verbose_eval=None, show_stdv=True, seed=0,
callbacks=None):
......@@ -298,11 +300,14 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
Custom evaluation function.
init_model : file name of lightgbm model or 'Booster' instance
model used for continued training
feature_name : list of str
feature_name : list of str, or 'auto'
Feature names
categorical_feature : list of str or int
Categorical features, type int represents index,
If 'auto' and data is pandas DataFrame, use data column names
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue.
......
......@@ -280,7 +280,7 @@ class LGBMModel(LGBMModelBase):
eval_init_score=None, eval_group=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None,
feature_name='auto', categorical_feature='auto',
callbacks=None):
"""
Fit the gradient boosting model
......@@ -311,12 +311,14 @@ class LGBMModel(LGBMModelBase):
early_stopping_rounds : int
verbose : bool
If `verbose` and an evaluation set is used, writes the evaluation
feature_name : list of str
feature_name : list of str, or 'auto'
Feature names
categorical_feature : list of str or int
If 'auto' and data is pandas DataFrame, use data column names
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
callbacks : list of callback functions
List of callback functions that are applied at each iteration.
See Callbacks in Python-API.md for more information.
......@@ -506,7 +508,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_init_score=None,
eval_metric="l2",
early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None, callbacks=None):
feature_name='auto', categorical_feature='auto', callbacks=None):
super(LGBMRegressor, self).fit(X, y, sample_weight=sample_weight,
init_score=init_score, eval_set=eval_set,
......@@ -552,7 +554,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_init_score=None,
eval_metric="binary_logloss",
early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None,
feature_name='auto', categorical_feature='auto',
callbacks=None):
self._le = LGBMLabelEncoder().fit(y)
y = self._le.transform(y)
......@@ -653,7 +655,7 @@ class LGBMRanker(LGBMModel):
eval_init_score=None, eval_group=None,
eval_metric='ndcg', eval_at=1,
early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None,
feature_name='auto', categorical_feature='auto',
callbacks=None):
"""
Most arguments are the same as in common methods except the following:
......
......@@ -12,9 +12,15 @@ from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
try:
import pandas as pd
IS_PANDAS_INSTALLED = True
except ImportError:
IS_PANDAS_INSTALLED = False
try:
import cPickle as pickle
except:
except ImportError:
import pickle
......@@ -22,31 +28,33 @@ def multi_logloss(y_true, y_pred):
return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)])
def test_template(params={'objective': 'regression', 'metric': 'l2'},
X_y=load_boston(True), feval=mean_squared_error,
num_round=100, init_model=None, custom_eval=None,
early_stopping_rounds=10,
return_data=False, return_model=False):
params['verbose'], params['seed'] = -1, 42
X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
lgb_train = lgb.Dataset(X_train, y_train, params=params)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
if return_data:
return lgb_train, lgb_eval
evals_result = {}
gbm = lgb.train(params, lgb_train,
num_boost_round=num_round,
valid_sets=lgb_eval,
valid_names='eval',
verbose_eval=False,
feval=custom_eval,
evals_result=evals_result,
early_stopping_rounds=early_stopping_rounds,
init_model=init_model)
if return_model:
return gbm
else:
return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration))
class template(object):
@staticmethod
def test_template(params={'objective': 'regression', 'metric': 'l2'},
X_y=load_boston(True), feval=mean_squared_error,
num_round=100, init_model=None, custom_eval=None,
early_stopping_rounds=10,
return_data=False, return_model=False):
params['verbose'], params['seed'] = -1, 42
X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
lgb_train = lgb.Dataset(X_train, y_train, params=params)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
if return_data:
return lgb_train, lgb_eval
evals_result = {}
gbm = lgb.train(params, lgb_train,
num_boost_round=num_round,
valid_sets=lgb_eval,
valid_names='eval',
verbose_eval=False,
feval=custom_eval,
evals_result=evals_result,
early_stopping_rounds=early_stopping_rounds,
init_model=init_model)
if return_model:
return gbm
else:
return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration))
class TestEngine(unittest.TestCase):
......@@ -57,12 +65,12 @@ class TestEngine(unittest.TestCase):
'objective': 'binary',
'metric': 'binary_logloss'
}
evals_result, ret = test_template(params, X_y, log_loss)
evals_result, ret = template.test_template(params, X_y, log_loss)
self.assertLess(ret, 0.15)
self.assertAlmostEqual(min(evals_result['eval']['binary_logloss']), ret, places=5)
def test_regression(self):
evals_result, ret = test_template()
evals_result, ret = template.test_template()
ret **= 0.5
self.assertLess(ret, 4)
self.assertAlmostEqual(min(evals_result['eval']['l2']), ret, places=5)
......@@ -74,7 +82,7 @@ class TestEngine(unittest.TestCase):
'metric': 'multi_logloss',
'num_class': 10
}
evals_result, ret = test_template(params, X_y, multi_logloss)
evals_result, ret = template.test_template(params, X_y, multi_logloss)
self.assertLess(ret, 0.2)
self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5)
......@@ -84,11 +92,11 @@ class TestEngine(unittest.TestCase):
'metric': 'l1'
}
model_name = 'model.txt'
gbm = test_template(params, num_round=20, return_model=True, early_stopping_rounds=-1)
gbm = template.test_template(params, num_round=20, return_model=True, early_stopping_rounds=-1)
gbm.save_model(model_name)
evals_result, ret = test_template(params, feval=mean_absolute_error,
num_round=80, init_model=model_name,
custom_eval=(lambda p, d: ('mae', mean_absolute_error(p, d.get_label()), False)))
evals_result, ret = template.test_template(params, feval=mean_absolute_error,
num_round=80, init_model=model_name,
custom_eval=(lambda p, d: ('mae', mean_absolute_error(p, d.get_label()), False)))
self.assertLess(ret, 3)
self.assertAlmostEqual(min(evals_result['eval']['l1']), ret, places=5)
for l1, mae in zip(evals_result['eval']['l1'], evals_result['eval']['mae']):
......@@ -104,38 +112,52 @@ class TestEngine(unittest.TestCase):
'metric': 'multi_logloss',
'num_class': 3
}
gbm = test_template(params, X_y, num_round=20, return_model=True, early_stopping_rounds=-1)
evals_result, ret = test_template(params, X_y, feval=multi_logloss,
num_round=80, init_model=gbm)
gbm = template.test_template(params, X_y, num_round=20, return_model=True, early_stopping_rounds=-1)
evals_result, ret = template.test_template(params, X_y, feval=multi_logloss,
num_round=80, init_model=gbm)
self.assertLess(ret, 1.5)
self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5)
def test_cv(self):
lgb_train, _ = test_template(return_data=True)
lgb_train, _ = template.test_template(return_data=True)
lgb.cv({'verbose': -1}, lgb_train, num_boost_round=20, nfold=5,
metrics='l1', verbose_eval=False,
callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
def test_save_load_copy_pickle(self):
gbm = test_template(num_round=20, return_model=True)
_, ret_origin = test_template(init_model=gbm)
gbm = template.test_template(num_round=20, return_model=True)
_, ret_origin = template.test_template(init_model=gbm)
other_ret = []
gbm.save_model('lgb.model')
other_ret.append(test_template(init_model='lgb.model')[1])
other_ret.append(template.test_template(init_model='lgb.model')[1])
gbm_load = lgb.Booster(model_file='lgb.model')
other_ret.append(test_template(init_model=gbm_load)[1])
other_ret.append(test_template(init_model=copy.copy(gbm))[1])
other_ret.append(test_template(init_model=copy.deepcopy(gbm))[1])
other_ret.append(template.test_template(init_model=gbm_load)[1])
other_ret.append(template.test_template(init_model=copy.copy(gbm))[1])
other_ret.append(template.test_template(init_model=copy.deepcopy(gbm))[1])
with open('lgb.pkl', 'wb') as f:
pickle.dump(gbm, f)
with open('lgb.pkl', 'rb') as f:
gbm_pickle = pickle.load(f)
other_ret.append(test_template(init_model=gbm_pickle)[1])
other_ret.append(template.test_template(init_model=gbm_pickle)[1])
gbm_pickles = pickle.loads(pickle.dumps(gbm))
other_ret.append(test_template(init_model=gbm_pickles)[1])
other_ret.append(template.test_template(init_model=gbm_pickles)[1])
for ret in other_ret:
self.assertAlmostEqual(ret_origin, ret, places=5)
@unittest.skipIf(not IS_PANDAS_INSTALLED, 'pandas not installed')
def test_pandas_categorical(self):
X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75),
"B": np.random.permutation([1, 2, 3] * 100)})
X["A"] = X["A"].astype('category')
X["B"] = X["B"].astype('category')
y = np.random.permutation([0, 1] * 150)
params = {
'objective': 'binary',
'metric': 'binary_logloss',
'verbose': -1
}
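# smoke test: passing pandas categorical columns should train without manual encoding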
gbm = template.test_template(params=params, X_y=(X, y), return_model=True)
print("----------------------------------------------------------------------")
print("running test_engine.py")
......
......@@ -12,42 +12,44 @@ from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
def test_template(X_y=load_boston(True), model=lgb.LGBMRegressor,
feval=mean_squared_error, num_round=100,
custom_obj=None, predict_proba=False,
return_data=False, return_model=False):
X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
if return_data:
return X_train, X_test, y_train, y_test
arguments = {'n_estimators': num_round, 'silent': True}
if custom_obj:
arguments['objective'] = custom_obj
gbm = model(**arguments)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
if return_model:
return gbm
elif predict_proba:
return feval(y_test, gbm.predict_proba(X_test))
else:
return feval(y_test, gbm.predict(X_test))
class template(object):
@staticmethod
def test_template(X_y=load_boston(True), model=lgb.LGBMRegressor,
feval=mean_squared_error, num_round=100,
custom_obj=None, predict_proba=False,
return_data=False, return_model=False):
X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
if return_data:
return X_train, X_test, y_train, y_test
arguments = {'n_estimators': num_round, 'silent': True}
if custom_obj:
arguments['objective'] = custom_obj
gbm = model(**arguments)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
if return_model:
return gbm
elif predict_proba:
return feval(y_test, gbm.predict_proba(X_test))
else:
return feval(y_test, gbm.predict(X_test))
class TestSklearn(unittest.TestCase):
def test_binary(self):
X_y = load_breast_cancer(True)
ret = test_template(X_y, lgb.LGBMClassifier, log_loss, predict_proba=True)
ret = template.test_template(X_y, lgb.LGBMClassifier, log_loss, predict_proba=True)
self.assertLess(ret, 0.15)
def test_regression(self):
self.assertLess(test_template() ** 0.5, 4)
self.assertLess(template.test_template() ** 0.5, 4)
def test_multiclass(self):
X_y = load_digits(10, True)
def multi_error(y_true, y_pred):
return np.mean(y_true != y_pred)
ret = test_template(X_y, lgb.LGBMClassifier, multi_error)
ret = template.test_template(X_y, lgb.LGBMClassifier, multi_error)
self.assertLess(ret, 0.2)
def test_lambdarank(self):
......@@ -68,7 +70,7 @@ class TestSklearn(unittest.TestCase):
grad = (y_pred - y_true)
hess = np.ones(len(y_true))
return grad, hess
ret = test_template(custom_obj=objective_ls)
ret = template.test_template(custom_obj=objective_ls)
self.assertLess(ret, 100)
def test_binary_classification_with_custom_objective(self):
......@@ -81,17 +83,17 @@ class TestSklearn(unittest.TestCase):
def binary_error(y_test, y_pred):
return np.mean([int(p > 0.5) != y for y, p in zip(y_test, y_pred)])
ret = test_template(X_y, lgb.LGBMClassifier, feval=binary_error, custom_obj=logregobj)
ret = template.test_template(X_y, lgb.LGBMClassifier, feval=binary_error, custom_obj=logregobj)
self.assertLess(ret, 0.1)
def test_dart(self):
X_train, X_test, y_train, y_test = test_template(return_data=True)
X_train, X_test, y_train, y_test = template.test_template(return_data=True)
gbm = lgb.LGBMRegressor(boosting_type='dart')
gbm.fit(X_train, y_train)
self.assertLessEqual(gbm.score(X_train, y_train), 1.)
def test_grid_search(self):
X_train, X_test, y_train, y_test = test_template(return_data=True)
X_train, X_test, y_train, y_test = template.test_template(return_data=True)
params = {'boosting_type': ['dart', 'gbdt'],
'n_estimators': [15, 20],
'drop_rate': [0.1, 0.2]}
......@@ -100,27 +102,29 @@ class TestSklearn(unittest.TestCase):
self.assertIn(gbm.best_params_['n_estimators'], [15, 20])
def test_clone_and_property(self):
gbm = test_template(return_model=True)
gbm = template.test_template(return_model=True)
gbm_clone = clone(gbm)
self.assertIsInstance(gbm.booster_, lgb.Booster)
self.assertIsInstance(gbm.feature_importance_, np.ndarray)
clf = test_template(load_digits(2, True), model=lgb.LGBMClassifier, return_model=True)
clf = template.test_template(load_digits(2, True), model=lgb.LGBMClassifier, return_model=True)
self.assertListEqual(sorted(clf.classes_), [0, 1])
self.assertEqual(clf.n_classes_, 2)
self.assertIsInstance(clf.booster_, lgb.Booster)
self.assertIsInstance(clf.feature_importance_, np.ndarray)
def test_joblib(self):
gbm = test_template(num_round=10, return_model=True)
gbm = template.test_template(num_round=10, return_model=True)
joblib.dump(gbm, 'lgb.pkl')
gbm_pickle = joblib.load('lgb.pkl')
self.assertIsInstance(gbm_pickle.booster_, lgb.Booster)
self.assertDictEqual(gbm.get_params(), gbm_pickle.get_params())
self.assertListEqual(list(gbm.feature_importance_), list(gbm_pickle.feature_importance_))
X_train, X_test, y_train, y_test = test_template(return_data=True)
X_train, X_test, y_train, y_test = template.test_template(return_data=True)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
gbm_pickle.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
self.assertDictEqual(gbm.evals_result_, gbm_pickle.evals_result_)
for key in gbm.evals_result_:
for evals in zip(gbm.evals_result_[key], gbm_pickle.evals_result_[key]):
self.assertAlmostEqual(*evals, places=5)
pred_origin = gbm.predict(X_test)
pred_pickle = gbm_pickle.predict(X_test)
self.assertEqual(len(pred_origin), len(pred_pickle))
......