Commit 57d55272 authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

fix bug for pandas auto categorical_feature (#218)

* fix bug for categorical_feature

* add test on load model with categorical feature

* add unseen category in test dataset

* save/load pandas_categorical to model

* fix logic

* cast pandas columns to string

* add load pandas_categorical from file to _InnerPredictor init
parent adb8fb4e
...@@ -213,6 +213,64 @@ def c_int_array(data): ...@@ -213,6 +213,64 @@ def c_int_array(data):
return (ptr_data, type_data) return (ptr_data, type_data)
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'int64': 'int', 'uint8': 'int', 'uint16': 'int',
'uint32': 'int', 'uint64': 'int', 'float16': 'float',
'float32': 'float', 'float64': 'float', 'bool': 'int'}
def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical):
if isinstance(data, DataFrame):
if feature_name == 'auto' or feature_name is None:
data.rename(columns=str, inplace=True)
cat_cols = data.select_dtypes(include=['category']).columns
if pandas_categorical is None: # train dataset
pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
else:
if len(cat_cols) != len(pandas_categorical):
raise ValueError('train and valid dataset categorical_feature do not match.')
for col, category in zip(cat_cols, pandas_categorical):
if list(data[col].cat.categories) != list(category):
data[col] = data[col].cat.set_categories(category)
if len(cat_cols): # cat_cols is pandas Index object
data = data.copy() # not alter origin DataFrame
data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes)
if categorical_feature is not None:
if feature_name is None:
feature_name = list(data.columns)
if categorical_feature == 'auto':
categorical_feature = list(cat_cols)
else:
categorical_feature += list(cat_cols)
if feature_name == 'auto':
feature_name = list(data.columns)
data_dtypes = data.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
bad_fields = [data.columns[i] for i, dtype in
enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER]
msg = """DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields """
raise ValueError(msg + ', '.join(bad_fields))
data = data.values.astype('float')
else:
if feature_name == 'auto':
feature_name = None
if categorical_feature == 'auto':
categorical_feature = None
return data, feature_name, categorical_feature, pandas_categorical
def _label_from_pandas(label):
if isinstance(label, DataFrame):
if len(label.columns) > 1:
raise ValueError('DataFrame for label cannot have multiple columns')
label_dtypes = label.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in label_dtypes):
raise ValueError('DataFrame.dtypes for label must be int, float or bool')
label = label.values.astype('float')
return label
class _InnerPredictor(object): class _InnerPredictor(object):
""" """
A _InnerPredictor of LightGBM. A _InnerPredictor of LightGBM.
...@@ -244,6 +302,12 @@ class _InnerPredictor(object): ...@@ -244,6 +302,12 @@ class _InnerPredictor(object):
ctypes.byref(out_num_class))) ctypes.byref(out_num_class)))
self.num_class = out_num_class.value self.num_class = out_num_class.value
self.num_total_iteration = out_num_iterations.value self.num_total_iteration = out_num_iterations.value
with open(model_file, 'r') as f:
last_line = f.readlines()[-1]
if last_line.startswith('pandas_categorical:'):
self.pandas_categorical = eval(last_line[len('pandas_categorical:'):])
else:
self.pandas_categorical = None
elif booster_handle is not None: elif booster_handle is not None:
self.__is_manage_handle = False self.__is_manage_handle = False
self.handle = booster_handle self.handle = booster_handle
...@@ -257,6 +321,7 @@ class _InnerPredictor(object): ...@@ -257,6 +321,7 @@ class _InnerPredictor(object):
self.handle, self.handle,
ctypes.byref(out_num_iterations))) ctypes.byref(out_num_iterations)))
self.num_total_iteration = out_num_iterations.value self.num_total_iteration = out_num_iterations.value
self.pandas_categorical = None
else: else:
raise TypeError('Need Model file or Booster handle to create a predictor') raise TypeError('Need Model file or Booster handle to create a predictor')
...@@ -292,6 +357,7 @@ class _InnerPredictor(object): ...@@ -292,6 +357,7 @@ class _InnerPredictor(object):
""" """
if isinstance(data, Dataset): if isinstance(data, Dataset):
raise TypeError("Cannot use Dataset instance for prediction, please use raw data instead") raise TypeError("Cannot use Dataset instance for prediction, please use raw data instead")
data = _data_from_pandas(data, None, None, self.pandas_categorical)[0]
predict_type = C_API_PREDICT_NORMAL predict_type = C_API_PREDICT_NORMAL
if raw_score: if raw_score:
predict_type = C_API_PREDICT_RAW_SCORE predict_type = C_API_PREDICT_RAW_SCORE
...@@ -448,62 +514,6 @@ class _InnerPredictor(object): ...@@ -448,62 +514,6 @@ class _InnerPredictor(object):
return preds, nrow return preds, nrow
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'int64': 'int', 'uint8': 'int', 'uint16': 'int',
'uint32': 'int', 'uint64': 'int', 'float16': 'float',
'float32': 'float', 'float64': 'float', 'bool': 'int'}
def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical):
if isinstance(data, DataFrame):
cat_cols = data.select_dtypes(include=['category']).columns
if not pandas_categorical: # train dataset
pandas_categorical = (data[col].cat.categories for col in cat_cols)
else:
if len(cat_cols) != len(pandas_categorical):
raise ValueError('train and valid dataset categorical_feature do not match.')
for col, category in zip(cat_cols, pandas_categorical):
if data[col].cat.categories != category:
data[col] = data[col].cat.set_categories(category)
if len(cat_cols): # cat_cols is pandas Index object
data = data.copy() # not alter origin DataFrame
data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes)
if categorical_feature is not None:
if feature_name is None:
feature_name = data.columns
if categorical_feature == 'auto':
categorical_feature = cat_cols
else:
categorical_feature += cat_cols
if feature_name == 'auto':
feature_name = data.columns
data_dtypes = data.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
bad_fields = [data.columns[i] for i, dtype in
enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER]
msg = """DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields """
raise ValueError(msg + ', '.join(bad_fields))
data = data.values.astype('float')
else:
if feature_name == 'auto':
feature_name = None
if categorical_feature == 'auto':
categorical_feature = None
return data, feature_name, categorical_feature, pandas_categorical
def _label_from_pandas(label):
if isinstance(label, DataFrame):
if len(label.columns) > 1:
raise ValueError('DataFrame for label cannot have multiple columns')
label_dtypes = label.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in label_dtypes):
raise ValueError('DataFrame.dtypes for label must be int, float or bool')
label = label.values.astype('float')
return label
class Dataset(object): class Dataset(object):
"""Dataset in LightGBM.""" """Dataset in LightGBM."""
def __init__(self, data, label=None, max_bin=255, reference=None, def __init__(self, data, label=None, max_bin=255, reference=None,
...@@ -976,10 +986,10 @@ class Dataset(object): ...@@ -976,10 +986,10 @@ class Dataset(object):
Feature names Feature names
""" """
self.feature_name = feature_name self.feature_name = feature_name
if self.handle is not None and feature_name is not None: if self.handle is not None and feature_name is not None and feature_name != 'auto':
if len(feature_name) != self.num_feature(): if len(feature_name) != self.num_feature():
raise ValueError("Length of feature_name({}) and num_feature({}) don't match".format(len(feature_name), self.num_feature())) raise ValueError("Length of feature_name({}) and num_feature({}) don't match".format(len(feature_name), self.num_feature()))
c_feature_name = [c_str(str(name)) for name in feature_name] c_feature_name = [c_str(name) for name in feature_name]
_safe_call(_LIB.LGBM_DatasetSetFeatureNames( _safe_call(_LIB.LGBM_DatasetSetFeatureNames(
self.handle, self.handle,
c_array(ctypes.c_char_p, c_feature_name), c_array(ctypes.c_char_p, c_feature_name),
...@@ -1184,6 +1194,7 @@ class Booster(object): ...@@ -1184,6 +1194,7 @@ class Booster(object):
self.__inner_predict_buffer = [None] self.__inner_predict_buffer = [None]
self.__is_predicted_cur_iter = [False] self.__is_predicted_cur_iter = [False]
self.__get_eval_info() self.__get_eval_info()
self.pandas_categorical = train_set.pandas_categorical
elif model_file is not None: elif model_file is not None:
"""Prediction task""" """Prediction task"""
out_num_iterations = ctypes.c_int(0) out_num_iterations = ctypes.c_int(0)
...@@ -1196,6 +1207,12 @@ class Booster(object): ...@@ -1196,6 +1207,12 @@ class Booster(object):
self.handle, self.handle,
ctypes.byref(out_num_class))) ctypes.byref(out_num_class)))
self.__num_class = out_num_class.value self.__num_class = out_num_class.value
with open(model_file, 'r') as f:
last_line = f.readlines()[-1]
if last_line.startswith('pandas_categorical:'):
self.pandas_categorical = eval(last_line[len('pandas_categorical:'):])
else:
self.pandas_categorical = None
else: else:
raise TypeError('Need at least one training dataset or model file to create booster instance') raise TypeError('Need at least one training dataset or model file to create booster instance')
...@@ -1452,6 +1469,8 @@ class Booster(object): ...@@ -1452,6 +1469,8 @@ class Booster(object):
self.handle, self.handle,
ctypes.c_int(num_iteration), ctypes.c_int(num_iteration),
c_str(filename))) c_str(filename)))
with open(filename, 'a') as f:
f.write('\npandas_categorical:' + repr(self.pandas_categorical))
def dump_model(self, num_iteration=-1): def dump_model(self, num_iteration=-1):
""" """
...@@ -1515,7 +1534,7 @@ class Booster(object): ...@@ -1515,7 +1534,7 @@ class Booster(object):
------- -------
Prediction result Prediction result
""" """
predictor = _InnerPredictor(booster_handle=self.handle) predictor = self._to_predictor()
if num_iteration <= 0: if num_iteration <= 0:
num_iteration = self.best_iteration num_iteration = self.best_iteration
return predictor.predict(data, num_iteration, raw_score, pred_leaf, data_has_header, is_reshape) return predictor.predict(data, num_iteration, raw_score, pred_leaf, data_has_header, is_reshape)
...@@ -1524,6 +1543,7 @@ class Booster(object): ...@@ -1524,6 +1543,7 @@ class Booster(object):
"""Convert to predictor """Convert to predictor
""" """
predictor = _InnerPredictor(booster_handle=self.handle) predictor = _InnerPredictor(booster_handle=self.handle)
predictor.pandas_categorical = self.pandas_categorical
return predictor return predictor
def feature_importance(self, importance_type='split'): def feature_importance(self, importance_type='split'):
......
...@@ -98,7 +98,7 @@ def train(params, train_set, num_boost_round=100, ...@@ -98,7 +98,7 @@ def train(params, train_set, num_boost_round=100,
init_iteration = predictor.num_total_iteration if predictor is not None else 0 init_iteration = predictor.num_total_iteration if predictor is not None else 0
"""check dataset""" """check dataset"""
if not isinstance(train_set, Dataset): if not isinstance(train_set, Dataset):
raise TypeError("Traninig only accepts Dataset object") raise TypeError("Training only accepts Dataset object")
train_set._update_params(params) train_set._update_params(params)
train_set._set_predictor(predictor) train_set._set_predictor(predictor)
......
...@@ -151,12 +151,38 @@ class TestEngine(unittest.TestCase): ...@@ -151,12 +151,38 @@ class TestEngine(unittest.TestCase):
X["A"] = X["A"].astype('category') X["A"] = X["A"].astype('category')
X["B"] = X["B"].astype('category') X["B"] = X["B"].astype('category')
y = np.random.permutation([0, 1] * 150) y = np.random.permutation([0, 1] * 150)
X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),
"B": np.random.permutation([1, 3] * 30)})
X_test["A"] = X_test["A"].astype('category')
X_test["B"] = X_test["B"].astype('category')
params = { params = {
'objective': 'binary', 'objective': 'binary',
'metric': 'binary_logloss', 'metric': 'binary_logloss',
'verbose': -1 'verbose': -1
} }
gbm = template.test_template(params=params, X_y=(X, y), return_model=True) lgb_train = lgb.Dataset(X, y)
gbm0 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False)
pred0 = list(gbm0.predict(X_test))
lgb_train = lgb.Dataset(X, y)
gbm1 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
categorical_feature=[0])
pred1 = list(gbm1.predict(X_test))
lgb_train = lgb.Dataset(X, y)
gbm2 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
categorical_feature=['A'])
pred2 = list(gbm2.predict(X_test))
lgb_train = lgb.Dataset(X, y)
gbm3 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
categorical_feature=['A', 'B'])
pred3 = list(gbm3.predict(X_test))
lgb_train = lgb.Dataset(X, y)
gbm3.save_model('categorical.model')
gbm4 = lgb.Booster(model_file='categorical.model')
pred4 = list(gbm4.predict(X_test))
self.assertListEqual(pred0, pred1)
self.assertListEqual(pred0, pred2)
self.assertListEqual(pred0, pred3)
self.assertListEqual(pred0, pred4)
print("----------------------------------------------------------------------") print("----------------------------------------------------------------------")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment