Commit 6c248d37 authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

support pandas categorical (#193)

* support pandas categorical

* refine logic

* make default=auto

* fix train/valid categorical codes

* add test

* unify set _predictor

* fix tests

* fix warning

* support feature_name=int
parent 00e5b244
...@@ -14,7 +14,7 @@ before_install: ...@@ -14,7 +14,7 @@ before_install:
install: install:
- sudo apt-get install -y libopenmpi-dev openmpi-bin build-essential - sudo apt-get install -y libopenmpi-dev openmpi-bin build-essential
- conda install --yes atlas numpy scipy scikit-learn - conda install --yes atlas numpy scipy scikit-learn pandas
- pip install pep8 - pip install pep8
......
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
- [Booster](Python-API.md#booster) - [Booster](Python-API.md#booster)
* [Training API](Python-API.md#training-api) * [Training API](Python-API.md#training-api)
- [train](Python-API.md#trainparams-train_set-num_boost_round100-valid_setsnone-valid_namesnone-fobjnone-fevalnone-init_modelnone-feature_namenone-categorical_featurenone-early_stopping_roundsnone-evals_resultnone-verbose_evaltrue-learning_ratesnone-callbacksnone) - [train](Python-API.md#trainparams-train_set-num_boost_round100-valid_setsnone-valid_namesnone-fobjnone-fevalnone-init_modelnone-feature_nameauto-categorical_featureauto-early_stopping_roundsnone-evals_resultnone-verbose_evaltrue-learning_ratesnone-callbacksnone)
- [cv](Python-API.md#cvparams-train_set-num_boost_round10-nfold5-stratifiedfalse-shuffletrue-metricsnone-fobjnone-fevalnone-init_modelnone-feature_namenone-categorical_featurenone-early_stopping_roundsnone-fpreprocnone-verbose_evalnone-show_stdvtrue-seed0-callbacksnone) - [cv](Python-API.md#cvparams-train_set-num_boost_round10-nfold5-stratifiedfalse-shuffletrue-metricsnone-fobjnone-fevalnone-init_modelnone-feature_nameauto-categorical_featureauto-early_stopping_roundsnone-fpreprocnone-verbose_evalnone-show_stdvtrue-seed0-callbacksnone)
* [Scikit-learn API](Python-API.md#scikit-learn-api) * [Scikit-learn API](Python-API.md#scikit-learn-api)
- [Common Methods](Python-API.md#common-methods) - [Common Methods](Python-API.md#common-methods)
...@@ -31,7 +31,7 @@ The methods of each Class is in alphabetical order. ...@@ -31,7 +31,7 @@ The methods of each Class is in alphabetical order.
###Dataset ###Dataset
####__init__(data, label=None, max_bin=255, reference=None, weight=None, group=None, silent=False, feature_name=None, categorical_feature=None, params=None, free_raw_data=True) ####__init__(data, label=None, max_bin=255, reference=None, weight=None, group=None, silent=False, feature_name='auto', categorical_feature='auto', params=None, free_raw_data=True)
Parameters Parameters
---------- ----------
...@@ -50,12 +50,14 @@ The methods of each Class is in alphabetical order. ...@@ -50,12 +50,14 @@ The methods of each Class is in alphabetical order.
Group/query size for dataset Group/query size for dataset
silent : boolean, optional silent : boolean, optional
Whether print messages during construction Whether print messages during construction
feature_name : list of str feature_name : list of str, or 'auto'
Feature names Feature names
categorical_feature : list of str or list of int If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features, Categorical features,
type int represents index, type int represents index,
type str represents feature names (need to specify feature_name as well) type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
params : dict, optional params : dict, optional
Other parameters Other parameters
free_raw_data : Bool free_raw_data : Bool
...@@ -445,7 +447,7 @@ The methods of each Class is in alphabetical order. ...@@ -445,7 +447,7 @@ The methods of each Class is in alphabetical order.
##Training API ##Training API
####train(params, train_set, num_boost_round=100, valid_sets=None, valid_names=None, fobj=None, feval=None, init_model=None, feature_name=None, categorical_feature=None, early_stopping_rounds=None, evals_result=None, verbose_eval=True, learning_rates=None, callbacks=None) ####train(params, train_set, num_boost_round=100, valid_sets=None, valid_names=None, fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=None, evals_result=None, verbose_eval=True, learning_rates=None, callbacks=None)
Train with given parameters. Train with given parameters.
...@@ -468,12 +470,14 @@ The methods of each Class is in alphabetical order. ...@@ -468,12 +470,14 @@ The methods of each Class is in alphabetical order.
Note: should return (eval_name, eval_result, is_higher_better) of list of this Note: should return (eval_name, eval_result, is_higher_better) of list of this
init_model : file name of lightgbm model or 'Booster' instance init_model : file name of lightgbm model or 'Booster' instance
model used for continued train model used for continued train
feature_name : list of str feature_name : list of str, or 'auto'
Feature names Feature names
categorical_feature : list of str or list of int If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features, Categorical features,
type int represents index, type int represents index,
type str represents feature names (need to specify feature_name as well) type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int early_stopping_rounds: int
Activates early stopping. Activates early stopping.
Requires at least one validation data and one metric Requires at least one validation data and one metric
...@@ -513,7 +517,7 @@ The methods of each Class is in alphabetical order. ...@@ -513,7 +517,7 @@ The methods of each Class is in alphabetical order.
booster : a trained booster model booster : a trained booster model
####cv(params, train_set, num_boost_round=10, nfold=5, stratified=False, shuffle=True, metrics=None, fobj=None, feval=None, init_model=None, feature_name=None, categorical_feature=None, early_stopping_rounds=None, fpreproc=None, verbose_eval=None, show_stdv=True, seed=0, callbacks=None) ####cv(params, train_set, num_boost_round=10, nfold=5, stratified=False, shuffle=True, metrics=None, fobj=None, feval=None, init_model=None, feature_name='auto', categorical_feature='auto', early_stopping_rounds=None, fpreproc=None, verbose_eval=None, show_stdv=True, seed=0, callbacks=None)
Cross-validation with given parameters. Cross-validation with given parameters.
...@@ -541,11 +545,14 @@ The methods of each Class is in alphabetical order. ...@@ -541,11 +545,14 @@ The methods of each Class is in alphabetical order.
Custom evaluation function. Custom evaluation function.
init_model : file name of lightgbm model or 'Booster' instance init_model : file name of lightgbm model or 'Booster' instance
model used for continued train model used for continued train
feature_name : list of str feature_name : list of str, or 'auto'
Feature names Feature names
categorical_feature : list of str or int If 'auto' and data is pandas DataFrame, use data columns name
Categorical features, type int represents index, categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well) type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue. every <early_stopping_rounds> round(s) to continue.
...@@ -693,7 +700,7 @@ The methods of each Class is in alphabetical order. ...@@ -693,7 +700,7 @@ The methods of each Class is in alphabetical order.
X_leaves : array_like, shape=[n_samples, n_trees] X_leaves : array_like, shape=[n_samples, n_trees]
####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name=None, categorical_feature=None, callbacks=None) ####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name='auto', categorical_feature='auto', callbacks=None)
Fit the gradient boosting model. Fit the gradient boosting model.
...@@ -724,12 +731,14 @@ The methods of each Class is in alphabetical order. ...@@ -724,12 +731,14 @@ The methods of each Class is in alphabetical order.
early_stopping_rounds : int early_stopping_rounds : int
verbose : bool verbose : bool
If `verbose` and an evaluation set is used, writes the evaluation If `verbose` and an evaluation set is used, writes the evaluation
feature_name : list of str feature_name : list of str, or 'auto'
Feature names Feature names
categorical_feature : list of str or int If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features, Categorical features,
type int represents index, type int represents index,
type str represents feature names (need to specify feature_name as well). type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
callbacks : list of callback functions callbacks : list of callback functions
List of callback functions that are applied at each iteration. List of callback functions that are applied at each iteration.
See Callbacks in Python-API.md for more information. See Callbacks in Python-API.md for more information.
...@@ -823,7 +832,7 @@ The methods of each Class is in alphabetical order. ...@@ -823,7 +832,7 @@ The methods of each Class is in alphabetical order.
###LGBMRanker ###LGBMRanker
####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric='ndcg', eval_at=1, early_stopping_rounds=None, verbose=True, feature_name=None, categorical_feature=None, callbacks=None) ####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric='ndcg', eval_at=1, early_stopping_rounds=None, verbose=True, feature_name='auto', categorical_feature='auto', callbacks=None)
Most arguments are same as Common Methods except: Most arguments are same as Common Methods except:
......
...@@ -454,8 +454,29 @@ PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int', ...@@ -454,8 +454,29 @@ PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'float32': 'float', 'float64': 'float', 'bool': 'int'} 'float32': 'float', 'float64': 'float', 'bool': 'int'}
def _data_from_pandas(data): def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical):
if isinstance(data, DataFrame): if isinstance(data, DataFrame):
cat_cols = data.select_dtypes(include=['category']).columns
if not pandas_categorical: # train dataset
pandas_categorical = (data[col].cat.categories for col in cat_cols)
else:
if len(cat_cols) != len(pandas_categorical):
raise ValueError('train and valid dataset categorical_feature do not match.')
for col, category in zip(cat_cols, pandas_categorical):
if data[col].cat.categories != category:
data[col] = data[col].cat.set_categories(category)
if len(cat_cols): # cat_cols is pandas Index object
data = data.copy() # not alter origin DataFrame
data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes)
if categorical_feature is not None:
if feature_name is None:
feature_name = data.columns
if categorical_feature == 'auto':
categorical_feature = cat_cols
else:
categorical_feature += cat_cols
if feature_name == 'auto':
feature_name = data.columns
data_dtypes = data.dtypes data_dtypes = data.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes): if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
bad_fields = [data.columns[i] for i, dtype in bad_fields = [data.columns[i] for i, dtype in
...@@ -464,7 +485,12 @@ def _data_from_pandas(data): ...@@ -464,7 +485,12 @@ def _data_from_pandas(data):
msg = """DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields """ msg = """DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields """
raise ValueError(msg + ', '.join(bad_fields)) raise ValueError(msg + ', '.join(bad_fields))
data = data.values.astype('float') data = data.values.astype('float')
return data else:
if feature_name == 'auto':
feature_name = None
if categorical_feature == 'auto':
categorical_feature = None
return data, feature_name, categorical_feature, pandas_categorical
def _label_from_pandas(label): def _label_from_pandas(label):
...@@ -482,7 +508,7 @@ class Dataset(object): ...@@ -482,7 +508,7 @@ class Dataset(object):
"""Dataset in LightGBM.""" """Dataset in LightGBM."""
def __init__(self, data, label=None, max_bin=255, reference=None, def __init__(self, data, label=None, max_bin=255, reference=None,
weight=None, group=None, silent=False, weight=None, group=None, silent=False,
feature_name=None, categorical_feature=None, params=None, feature_name='auto', categorical_feature='auto', params=None,
free_raw_data=True): free_raw_data=True):
""" """
Parameters Parameters
...@@ -502,12 +528,14 @@ class Dataset(object): ...@@ -502,12 +528,14 @@ class Dataset(object):
Group/query size for dataset Group/query size for dataset
silent : boolean, optional silent : boolean, optional
Whether print messages during construction Whether print messages during construction
feature_name : list of str feature_name : list of str, or 'auto'
Feature names Feature names
categorical_feature : list of str or int If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features, Categorical features,
type int represents index, type int represents index,
type str represents feature names (need to specify feature_name as well) type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
params: dict, optional params: dict, optional
Other parameters Other parameters
free_raw_data: Bool free_raw_data: Bool
...@@ -527,6 +555,7 @@ class Dataset(object): ...@@ -527,6 +555,7 @@ class Dataset(object):
self.free_raw_data = free_raw_data self.free_raw_data = free_raw_data
self.used_indices = None self.used_indices = None
self._predictor = None self._predictor = None
self.pandas_categorical = None
def __del__(self): def __del__(self):
self._free_handle() self._free_handle()
...@@ -538,12 +567,12 @@ class Dataset(object): ...@@ -538,12 +567,12 @@ class Dataset(object):
def _lazy_init(self, data, label=None, max_bin=255, reference=None, def _lazy_init(self, data, label=None, max_bin=255, reference=None,
weight=None, group=None, predictor=None, weight=None, group=None, predictor=None,
silent=False, feature_name=None, silent=False, feature_name='auto',
categorical_feature=None, params=None): categorical_feature='auto', params=None):
if data is None: if data is None:
self.handle = None self.handle = None
return return
data = _data_from_pandas(data) data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data, feature_name, categorical_feature, self.pandas_categorical)
label = _label_from_pandas(label) label = _label_from_pandas(label)
self.data_has_header = False self.data_has_header = False
"""process for args""" """process for args"""
...@@ -760,7 +789,8 @@ class Dataset(object): ...@@ -760,7 +789,8 @@ class Dataset(object):
ret = Dataset(data, label=label, max_bin=self.max_bin, reference=self, ret = Dataset(data, label=label, max_bin=self.max_bin, reference=self,
weight=weight, group=group, silent=silent, params=params, weight=weight, group=group, silent=silent, params=params,
free_raw_data=self.free_raw_data) free_raw_data=self.free_raw_data)
ret._set_predictor(self._predictor) ret._predictor = self._predictor
ret.pandas_categorical = self.pandas_categorical
return ret return ret
def subset(self, used_indices, params=None): def subset(self, used_indices, params=None):
...@@ -777,6 +807,7 @@ class Dataset(object): ...@@ -777,6 +807,7 @@ class Dataset(object):
ret = Dataset(None, reference=self, feature_name=self.feature_name, ret = Dataset(None, reference=self, feature_name=self.feature_name,
categorical_feature=self.categorical_feature, params=params) categorical_feature=self.categorical_feature, params=params)
ret._predictor = self._predictor ret._predictor = self._predictor
ret.pandas_categorical = self.pandas_categorical
ret.used_indices = used_indices ret.used_indices = used_indices
return ret return ret
...@@ -948,7 +979,7 @@ class Dataset(object): ...@@ -948,7 +979,7 @@ class Dataset(object):
if self.handle is not None and feature_name is not None: if self.handle is not None and feature_name is not None:
if len(feature_name) != self.num_feature(): if len(feature_name) != self.num_feature():
raise ValueError("Length of feature_name({}) and num_feature({}) don't match".format(len(feature_name), self.num_feature())) raise ValueError("Length of feature_name({}) and num_feature({}) don't match".format(len(feature_name), self.num_feature()))
c_feature_name = [c_str(name) for name in feature_name] c_feature_name = [c_str(str(name)) for name in feature_name]
_safe_call(_LIB.LGBM_DatasetSetFeatureNames( _safe_call(_LIB.LGBM_DatasetSetFeatureNames(
self.handle, self.handle,
c_array(ctypes.c_char_p, c_feature_name), c_array(ctypes.c_char_p, c_feature_name),
......
...@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types, ...@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types,
def train(params, train_set, num_boost_round=100, def train(params, train_set, num_boost_round=100,
valid_sets=None, valid_names=None, valid_sets=None, valid_names=None,
fobj=None, feval=None, init_model=None, fobj=None, feval=None, init_model=None,
feature_name=None, categorical_feature=None, feature_name='auto', categorical_feature='auto',
early_stopping_rounds=None, evals_result=None, early_stopping_rounds=None, evals_result=None,
verbose_eval=True, learning_rates=None, callbacks=None): verbose_eval=True, learning_rates=None, callbacks=None):
""" """
...@@ -42,12 +42,14 @@ def train(params, train_set, num_boost_round=100, ...@@ -42,12 +42,14 @@ def train(params, train_set, num_boost_round=100,
Note: should return (eval_name, eval_result, is_higher_better) of list of this Note: should return (eval_name, eval_result, is_higher_better) of list of this
init_model : file name of lightgbm model or 'Booster' instance init_model : file name of lightgbm model or 'Booster' instance
model used for continued train model used for continued train
feature_name : list of str feature_name : list of str, or 'auto'
Feature names Feature names
categorical_feature : list of str or int If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features, Categorical features,
type int represents index, type int represents index,
type str represents feature names (need to specify feature_name as well) type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int early_stopping_rounds: int
Activates early stopping. Activates early stopping.
Requires at least one validation data and one metric Requires at least one validation data and one metric
...@@ -267,7 +269,7 @@ def _agg_cv_result(raw_results): ...@@ -267,7 +269,7 @@ def _agg_cv_result(raw_results):
def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False, def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
shuffle=True, metrics=None, fobj=None, feval=None, init_model=None, shuffle=True, metrics=None, fobj=None, feval=None, init_model=None,
feature_name=None, categorical_feature=None, feature_name='auto', categorical_feature='auto',
early_stopping_rounds=None, fpreproc=None, early_stopping_rounds=None, fpreproc=None,
verbose_eval=None, show_stdv=True, seed=0, verbose_eval=None, show_stdv=True, seed=0,
callbacks=None): callbacks=None):
...@@ -298,11 +300,14 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False, ...@@ -298,11 +300,14 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
Custom evaluation function. Custom evaluation function.
init_model : file name of lightgbm model or 'Booster' instance init_model : file name of lightgbm model or 'Booster' instance
model used for continued train model used for continued train
feature_name : list of str feature_name : list of str, or 'auto'
Feature names Feature names
categorical_feature : list of str or int If 'auto' and data is pandas DataFrame, use data columns name
Categorical features, type int represents index, categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well) type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue. every <early_stopping_rounds> round(s) to continue.
......
...@@ -280,7 +280,7 @@ class LGBMModel(LGBMModelBase): ...@@ -280,7 +280,7 @@ class LGBMModel(LGBMModelBase):
eval_init_score=None, eval_group=None, eval_init_score=None, eval_group=None,
eval_metric=None, eval_metric=None,
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None, feature_name='auto', categorical_feature='auto',
callbacks=None): callbacks=None):
""" """
Fit the gradient boosting model Fit the gradient boosting model
...@@ -311,12 +311,14 @@ class LGBMModel(LGBMModelBase): ...@@ -311,12 +311,14 @@ class LGBMModel(LGBMModelBase):
early_stopping_rounds : int early_stopping_rounds : int
verbose : bool verbose : bool
If `verbose` and an evaluation set is used, writes the evaluation If `verbose` and an evaluation set is used, writes the evaluation
feature_name : list of str feature_name : list of str, or 'auto'
Feature names Feature names
categorical_feature : list of str or int If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features, Categorical features,
type int represents index, type int represents index,
type str represents feature names (need to specify feature_name as well) type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
callbacks : list of callback functions callbacks : list of callback functions
List of callback functions that are applied at each iteration. List of callback functions that are applied at each iteration.
See Callbacks in Python-API.md for more information. See Callbacks in Python-API.md for more information.
...@@ -506,7 +508,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase): ...@@ -506,7 +508,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_init_score=None, eval_init_score=None,
eval_metric="l2", eval_metric="l2",
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None, callbacks=None): feature_name='auto', categorical_feature='auto', callbacks=None):
super(LGBMRegressor, self).fit(X, y, sample_weight=sample_weight, super(LGBMRegressor, self).fit(X, y, sample_weight=sample_weight,
init_score=init_score, eval_set=eval_set, init_score=init_score, eval_set=eval_set,
...@@ -552,7 +554,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase): ...@@ -552,7 +554,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_init_score=None, eval_init_score=None,
eval_metric="binary_logloss", eval_metric="binary_logloss",
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None, feature_name='auto', categorical_feature='auto',
callbacks=None): callbacks=None):
self._le = LGBMLabelEncoder().fit(y) self._le = LGBMLabelEncoder().fit(y)
y = self._le.transform(y) y = self._le.transform(y)
...@@ -653,7 +655,7 @@ class LGBMRanker(LGBMModel): ...@@ -653,7 +655,7 @@ class LGBMRanker(LGBMModel):
eval_init_score=None, eval_group=None, eval_init_score=None, eval_group=None,
eval_metric='ndcg', eval_at=1, eval_metric='ndcg', eval_at=1,
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None, feature_name='auto', categorical_feature='auto',
callbacks=None): callbacks=None):
""" """
Most arguments like common methods except following: Most arguments like common methods except following:
......
...@@ -12,9 +12,15 @@ from sklearn.datasets import (load_boston, load_breast_cancer, load_digits, ...@@ -12,9 +12,15 @@ from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
try:
import pandas as pd
IS_PANDAS_INSTALLED = True
except ImportError:
IS_PANDAS_INSTALLED = False
try: try:
import cPickle as pickle import cPickle as pickle
except: except ImportError:
import pickle import pickle
...@@ -22,31 +28,33 @@ def multi_logloss(y_true, y_pred): ...@@ -22,31 +28,33 @@ def multi_logloss(y_true, y_pred):
return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)]) return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)])
def test_template(params={'objective': 'regression', 'metric': 'l2'}, class template(object):
X_y=load_boston(True), feval=mean_squared_error, @staticmethod
num_round=100, init_model=None, custom_eval=None, def test_template(params={'objective': 'regression', 'metric': 'l2'},
early_stopping_rounds=10, X_y=load_boston(True), feval=mean_squared_error,
return_data=False, return_model=False): num_round=100, init_model=None, custom_eval=None,
params['verbose'], params['seed'] = -1, 42 early_stopping_rounds=10,
X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42) return_data=False, return_model=False):
lgb_train = lgb.Dataset(X_train, y_train, params=params) params['verbose'], params['seed'] = -1, 42
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params) X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
if return_data: lgb_train = lgb.Dataset(X_train, y_train, params=params)
return lgb_train, lgb_eval lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
evals_result = {} if return_data:
gbm = lgb.train(params, lgb_train, return lgb_train, lgb_eval
num_boost_round=num_round, evals_result = {}
valid_sets=lgb_eval, gbm = lgb.train(params, lgb_train,
valid_names='eval', num_boost_round=num_round,
verbose_eval=False, valid_sets=lgb_eval,
feval=custom_eval, valid_names='eval',
evals_result=evals_result, verbose_eval=False,
early_stopping_rounds=early_stopping_rounds, feval=custom_eval,
init_model=init_model) evals_result=evals_result,
if return_model: early_stopping_rounds=early_stopping_rounds,
return gbm init_model=init_model)
else: if return_model:
return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration)) return gbm
else:
return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration))
class TestEngine(unittest.TestCase): class TestEngine(unittest.TestCase):
...@@ -57,12 +65,12 @@ class TestEngine(unittest.TestCase): ...@@ -57,12 +65,12 @@ class TestEngine(unittest.TestCase):
'objective': 'binary', 'objective': 'binary',
'metric': 'binary_logloss' 'metric': 'binary_logloss'
} }
evals_result, ret = test_template(params, X_y, log_loss) evals_result, ret = template.test_template(params, X_y, log_loss)
self.assertLess(ret, 0.15) self.assertLess(ret, 0.15)
self.assertAlmostEqual(min(evals_result['eval']['binary_logloss']), ret, places=5) self.assertAlmostEqual(min(evals_result['eval']['binary_logloss']), ret, places=5)
def test_regreesion(self): def test_regreesion(self):
evals_result, ret = test_template() evals_result, ret = template.test_template()
ret **= 0.5 ret **= 0.5
self.assertLess(ret, 4) self.assertLess(ret, 4)
self.assertAlmostEqual(min(evals_result['eval']['l2']), ret, places=5) self.assertAlmostEqual(min(evals_result['eval']['l2']), ret, places=5)
...@@ -74,7 +82,7 @@ class TestEngine(unittest.TestCase): ...@@ -74,7 +82,7 @@ class TestEngine(unittest.TestCase):
'metric': 'multi_logloss', 'metric': 'multi_logloss',
'num_class': 10 'num_class': 10
} }
evals_result, ret = test_template(params, X_y, multi_logloss) evals_result, ret = template.test_template(params, X_y, multi_logloss)
self.assertLess(ret, 0.2) self.assertLess(ret, 0.2)
self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5) self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5)
...@@ -84,11 +92,11 @@ class TestEngine(unittest.TestCase): ...@@ -84,11 +92,11 @@ class TestEngine(unittest.TestCase):
'metric': 'l1' 'metric': 'l1'
} }
model_name = 'model.txt' model_name = 'model.txt'
gbm = test_template(params, num_round=20, return_model=True, early_stopping_rounds=-1) gbm = template.test_template(params, num_round=20, return_model=True, early_stopping_rounds=-1)
gbm.save_model(model_name) gbm.save_model(model_name)
evals_result, ret = test_template(params, feval=mean_absolute_error, evals_result, ret = template.test_template(params, feval=mean_absolute_error,
num_round=80, init_model=model_name, num_round=80, init_model=model_name,
custom_eval=(lambda p, d: ('mae', mean_absolute_error(p, d.get_label()), False))) custom_eval=(lambda p, d: ('mae', mean_absolute_error(p, d.get_label()), False)))
self.assertLess(ret, 3) self.assertLess(ret, 3)
self.assertAlmostEqual(min(evals_result['eval']['l1']), ret, places=5) self.assertAlmostEqual(min(evals_result['eval']['l1']), ret, places=5)
for l1, mae in zip(evals_result['eval']['l1'], evals_result['eval']['mae']): for l1, mae in zip(evals_result['eval']['l1'], evals_result['eval']['mae']):
...@@ -104,38 +112,52 @@ class TestEngine(unittest.TestCase): ...@@ -104,38 +112,52 @@ class TestEngine(unittest.TestCase):
'metric': 'multi_logloss', 'metric': 'multi_logloss',
'num_class': 3 'num_class': 3
} }
gbm = test_template(params, X_y, num_round=20, return_model=True, early_stopping_rounds=-1) gbm = template.test_template(params, X_y, num_round=20, return_model=True, early_stopping_rounds=-1)
evals_result, ret = test_template(params, X_y, feval=multi_logloss, evals_result, ret = template.test_template(params, X_y, feval=multi_logloss,
num_round=80, init_model=gbm) num_round=80, init_model=gbm)
self.assertLess(ret, 1.5) self.assertLess(ret, 1.5)
self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5) self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5)
def test_cv(self): def test_cv(self):
lgb_train, _ = test_template(return_data=True) lgb_train, _ = template.test_template(return_data=True)
lgb.cv({'verbose': -1}, lgb_train, num_boost_round=20, nfold=5, lgb.cv({'verbose': -1}, lgb_train, num_boost_round=20, nfold=5,
metrics='l1', verbose_eval=False, metrics='l1', verbose_eval=False,
callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)]) callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
def test_save_load_copy_pickle(self): def test_save_load_copy_pickle(self):
gbm = test_template(num_round=20, return_model=True) gbm = template.test_template(num_round=20, return_model=True)
_, ret_origin = test_template(init_model=gbm) _, ret_origin = template.test_template(init_model=gbm)
other_ret = [] other_ret = []
gbm.save_model('lgb.model') gbm.save_model('lgb.model')
other_ret.append(test_template(init_model='lgb.model')[1]) other_ret.append(template.test_template(init_model='lgb.model')[1])
gbm_load = lgb.Booster(model_file='lgb.model') gbm_load = lgb.Booster(model_file='lgb.model')
other_ret.append(test_template(init_model=gbm_load)[1]) other_ret.append(template.test_template(init_model=gbm_load)[1])
other_ret.append(test_template(init_model=copy.copy(gbm))[1]) other_ret.append(template.test_template(init_model=copy.copy(gbm))[1])
other_ret.append(test_template(init_model=copy.deepcopy(gbm))[1]) other_ret.append(template.test_template(init_model=copy.deepcopy(gbm))[1])
with open('lgb.pkl', 'wb') as f: with open('lgb.pkl', 'wb') as f:
pickle.dump(gbm, f) pickle.dump(gbm, f)
with open('lgb.pkl', 'rb') as f: with open('lgb.pkl', 'rb') as f:
gbm_pickle = pickle.load(f) gbm_pickle = pickle.load(f)
other_ret.append(test_template(init_model=gbm_pickle)[1]) other_ret.append(template.test_template(init_model=gbm_pickle)[1])
gbm_pickles = pickle.loads(pickle.dumps(gbm)) gbm_pickles = pickle.loads(pickle.dumps(gbm))
other_ret.append(test_template(init_model=gbm_pickles)[1]) other_ret.append(template.test_template(init_model=gbm_pickles)[1])
for ret in other_ret: for ret in other_ret:
self.assertAlmostEqual(ret_origin, ret, places=5) self.assertAlmostEqual(ret_origin, ret, places=5)
@unittest.skipIf(not IS_PANDAS_INSTALLED, 'pandas not installed')
def test_pandas_categorical(self):
    """Train on a DataFrame whose columns use pandas categorical dtype.

    Exercises the categorical-feature code path end-to-end: one
    string-valued and one integer-valued categorical column, so both
    category-codes conversions are covered.
    """
    X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75),
                      "B": np.random.permutation([1, 2, 3] * 100)})
    X["A"] = X["A"].astype('category')
    X["B"] = X["B"].astype('category')
    y = np.random.permutation([0, 1] * 150)
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': -1
    }
    gbm = template.test_template(params=params, X_y=(X, y), return_model=True)
    # The original test only checked that training did not raise and left
    # `gbm` unused; also assert that a model object was actually returned.
    self.assertIsNotNone(gbm)
print("----------------------------------------------------------------------") print("----------------------------------------------------------------------")
print("running test_engine.py") print("running test_engine.py")
......
...@@ -12,42 +12,44 @@ from sklearn.metrics import log_loss, mean_squared_error ...@@ -12,42 +12,44 @@ from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split from sklearn.model_selection import GridSearchCV, train_test_split
class template(object):
    """Shared helper for the scikit-learn API tests.

    Wrapped in a class with a single ``@staticmethod`` so test modules can
    call ``template.test_template(...)`` without the unittest collector
    picking the helper function up as a test case of its own.
    """

    @staticmethod
    def test_template(X_y=None, model=lgb.LGBMRegressor,
                      feval=mean_squared_error, num_round=100,
                      custom_obj=None, predict_proba=False,
                      return_data=False, return_model=False):
        """Train ``model`` on a train/test split of ``X_y`` and score it.

        Parameters
        ----------
        X_y : tuple (X, y), optional
            Dataset to use; defaults to the Boston housing regression data.
        model : estimator class
            scikit-learn style LightGBM estimator to instantiate.
        feval : callable(y_true, y_pred) -> float
            Metric applied to the held-out predictions for the return value.
        num_round : int
            Number of boosting rounds (passed as ``n_estimators``).
        custom_obj : callable, optional
            Custom objective forwarded to the estimator constructor.
        predict_proba : bool
            If True, score ``predict_proba`` output instead of ``predict``.
        return_data : bool
            If True, return the split ``(X_train, X_test, y_train, y_test)``
            without training.
        return_model : bool
            If True, return the fitted estimator instead of a score.
        """
        # FIX: the default used to be ``X_y=load_boston(True)``, which is
        # evaluated once at function-definition time and therefore loads the
        # dataset at import even when every caller supplies X_y explicitly.
        # A None sentinel defers the load to first actual use.
        if X_y is None:
            X_y = load_boston(True)
        X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
        if return_data:
            return X_train, X_test, y_train, y_test
        arguments = {'n_estimators': num_round, 'silent': True}
        if custom_obj:
            arguments['objective'] = custom_obj
        gbm = model(**arguments)
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
        if return_model:
            return gbm
        elif predict_proba:
            # Probability estimates are needed e.g. for log-loss metrics.
            return feval(y_test, gbm.predict_proba(X_test))
        else:
            return feval(y_test, gbm.predict(X_test))
class TestSklearn(unittest.TestCase): class TestSklearn(unittest.TestCase):
def test_binary(self): def test_binary(self):
X_y = load_breast_cancer(True) X_y = load_breast_cancer(True)
ret = test_template(X_y, lgb.LGBMClassifier, log_loss, predict_proba=True) ret = template.test_template(X_y, lgb.LGBMClassifier, log_loss, predict_proba=True)
self.assertLess(ret, 0.15) self.assertLess(ret, 0.15)
def test_regreesion(self): def test_regreesion(self):
self.assertLess(test_template() ** 0.5, 4) self.assertLess(template.test_template() ** 0.5, 4)
def test_multiclass(self): def test_multiclass(self):
X_y = load_digits(10, True) X_y = load_digits(10, True)
def multi_error(y_true, y_pred): def multi_error(y_true, y_pred):
return np.mean(y_true != y_pred) return np.mean(y_true != y_pred)
ret = test_template(X_y, lgb.LGBMClassifier, multi_error) ret = template.test_template(X_y, lgb.LGBMClassifier, multi_error)
self.assertLess(ret, 0.2) self.assertLess(ret, 0.2)
def test_lambdarank(self): def test_lambdarank(self):
...@@ -68,7 +70,7 @@ class TestSklearn(unittest.TestCase): ...@@ -68,7 +70,7 @@ class TestSklearn(unittest.TestCase):
grad = (y_pred - y_true) grad = (y_pred - y_true)
hess = np.ones(len(y_true)) hess = np.ones(len(y_true))
return grad, hess return grad, hess
ret = test_template(custom_obj=objective_ls) ret = template.test_template(custom_obj=objective_ls)
self.assertLess(ret, 100) self.assertLess(ret, 100)
def test_binary_classification_with_custom_objective(self): def test_binary_classification_with_custom_objective(self):
...@@ -81,17 +83,17 @@ class TestSklearn(unittest.TestCase): ...@@ -81,17 +83,17 @@ class TestSklearn(unittest.TestCase):
def binary_error(y_test, y_pred): def binary_error(y_test, y_pred):
return np.mean([int(p > 0.5) != y for y, p in zip(y_test, y_pred)]) return np.mean([int(p > 0.5) != y for y, p in zip(y_test, y_pred)])
ret = test_template(X_y, lgb.LGBMClassifier, feval=binary_error, custom_obj=logregobj) ret = template.test_template(X_y, lgb.LGBMClassifier, feval=binary_error, custom_obj=logregobj)
self.assertLess(ret, 0.1) self.assertLess(ret, 0.1)
def test_dart(self): def test_dart(self):
X_train, X_test, y_train, y_test = test_template(return_data=True) X_train, X_test, y_train, y_test = template.test_template(return_data=True)
gbm = lgb.LGBMRegressor(boosting_type='dart') gbm = lgb.LGBMRegressor(boosting_type='dart')
gbm.fit(X_train, y_train) gbm.fit(X_train, y_train)
self.assertLessEqual(gbm.score(X_train, y_train), 1.) self.assertLessEqual(gbm.score(X_train, y_train), 1.)
def test_grid_search(self): def test_grid_search(self):
X_train, X_test, y_train, y_test = test_template(return_data=True) X_train, X_test, y_train, y_test = template.test_template(return_data=True)
params = {'boosting_type': ['dart', 'gbdt'], params = {'boosting_type': ['dart', 'gbdt'],
'n_estimators': [15, 20], 'n_estimators': [15, 20],
'drop_rate': [0.1, 0.2]} 'drop_rate': [0.1, 0.2]}
...@@ -100,27 +102,29 @@ class TestSklearn(unittest.TestCase): ...@@ -100,27 +102,29 @@ class TestSklearn(unittest.TestCase):
self.assertIn(gbm.best_params_['n_estimators'], [15, 20]) self.assertIn(gbm.best_params_['n_estimators'], [15, 20])
def test_clone_and_property(self): def test_clone_and_property(self):
gbm = test_template(return_model=True) gbm = template.test_template(return_model=True)
gbm_clone = clone(gbm) gbm_clone = clone(gbm)
self.assertIsInstance(gbm.booster_, lgb.Booster) self.assertIsInstance(gbm.booster_, lgb.Booster)
self.assertIsInstance(gbm.feature_importance_, np.ndarray) self.assertIsInstance(gbm.feature_importance_, np.ndarray)
clf = test_template(load_digits(2, True), model=lgb.LGBMClassifier, return_model=True) clf = template.test_template(load_digits(2, True), model=lgb.LGBMClassifier, return_model=True)
self.assertListEqual(sorted(clf.classes_), [0, 1]) self.assertListEqual(sorted(clf.classes_), [0, 1])
self.assertEqual(clf.n_classes_, 2) self.assertEqual(clf.n_classes_, 2)
self.assertIsInstance(clf.booster_, lgb.Booster) self.assertIsInstance(clf.booster_, lgb.Booster)
self.assertIsInstance(clf.feature_importance_, np.ndarray) self.assertIsInstance(clf.feature_importance_, np.ndarray)
def test_joblib(self): def test_joblib(self):
gbm = test_template(num_round=10, return_model=True) gbm = template.test_template(num_round=10, return_model=True)
joblib.dump(gbm, 'lgb.pkl') joblib.dump(gbm, 'lgb.pkl')
gbm_pickle = joblib.load('lgb.pkl') gbm_pickle = joblib.load('lgb.pkl')
self.assertIsInstance(gbm_pickle.booster_, lgb.Booster) self.assertIsInstance(gbm_pickle.booster_, lgb.Booster)
self.assertDictEqual(gbm.get_params(), gbm_pickle.get_params()) self.assertDictEqual(gbm.get_params(), gbm_pickle.get_params())
self.assertListEqual(list(gbm.feature_importance_), list(gbm_pickle.feature_importance_)) self.assertListEqual(list(gbm.feature_importance_), list(gbm_pickle.feature_importance_))
X_train, X_test, y_train, y_test = test_template(return_data=True) X_train, X_test, y_train, y_test = template.test_template(return_data=True)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
gbm_pickle.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False) gbm_pickle.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
self.assertDictEqual(gbm.evals_result_, gbm_pickle.evals_result_) for key in gbm.evals_result_:
for evals in zip(gbm.evals_result_[key], gbm_pickle.evals_result_[key]):
self.assertAlmostEqual(*evals, places=5)
pred_origin = gbm.predict(X_test) pred_origin = gbm.predict(X_test)
pred_pickle = gbm_pickle.predict(X_test) pred_pickle = gbm_pickle.predict(X_test)
self.assertEqual(len(pred_origin), len(pred_pickle)) self.assertEqual(len(pred_origin), len(pred_pickle))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment