Unverified Commit d115769c authored by Nikita Titov's avatar Nikita Titov Committed by GitHub
Browse files

[python] ignore pandas ordered categorical columns by default (#2115)

* ignore pandas ordered categorical columns by default

* fix tests

* fix tests

* added comments
parent 89f2021a
......@@ -258,7 +258,8 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
raise ValueError('Input data must be 2 dimensional and non empty.')
if feature_name == 'auto' or feature_name is None:
data = data.rename(columns=str)
cat_cols = data.select_dtypes(include=['category']).columns
cat_cols = list(data.select_dtypes(include=['category']).columns)
cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered]
if pandas_categorical is None: # train dataset
pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
else:
......@@ -267,26 +268,25 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
for col, category in zip_(cat_cols, pandas_categorical):
if list(data[col].cat.categories) != list(category):
data[col] = data[col].cat.set_categories(category)
if len(cat_cols): # cat_cols is pandas Index object
if len(cat_cols): # cat_cols is list
data = data.copy() # not alter origin DataFrame
data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan})
if categorical_feature is not None:
if feature_name is None:
feature_name = list(data.columns)
if categorical_feature == 'auto':
categorical_feature = list(cat_cols)
categorical_feature = cat_cols_not_ordered
else:
categorical_feature = list(categorical_feature) + list(cat_cols)
categorical_feature = list(categorical_feature) + cat_cols_not_ordered
if feature_name == 'auto':
feature_name = list(data.columns)
data_dtypes = data.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
bad_fields = [data.columns[i] for i, dtype in
enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER]
msg = ("DataFrame.dtypes for data must be int, float or bool.\n"
"Did not expect the data types in fields ")
raise ValueError(msg + ', '.join(bad_fields))
raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n"
"Did not expect the data types in fields "
+ ', '.join(bad_fields))
data = data.values.astype('float')
else:
if feature_name == 'auto':
......@@ -686,7 +686,7 @@ class Dataset(object):
Categorical features.
If list of int, interpreted as indices.
If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
All values in categorical features should be less than int32 max value (2147483647).
Large values could be memory consuming. Consider using consecutive integers starting from zero.
All negative values in categorical features will be treated as missing values.
......
......@@ -56,7 +56,7 @@ def train(params, train_set, num_boost_round=100,
Categorical features.
If list of int, interpreted as indices.
If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
All values in categorical features should be less than int32 max value (2147483647).
Large values could be memory consuming. Consider using consecutive integers starting from zero.
All negative values in categorical features will be treated as missing values.
......@@ -391,7 +391,7 @@ def cv(params, train_set, num_boost_round=100,
Categorical features.
If list of int, interpreted as indices.
If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
All values in categorical features should be less than int32 max value (2147483647).
Large values could be memory consuming. Consider using consecutive integers starting from zero.
All negative values in categorical features will be treated as missing values.
......
......@@ -392,7 +392,7 @@ class LGBMModel(_LGBMModelBase):
Categorical features.
If list of int, interpreted as indices.
If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
All values in categorical features should be less than int32 max value (2147483647).
Large values could be memory consuming. Consider using consecutive integers starting from zero.
All negative values in categorical features will be treated as missing values.
......
......@@ -553,39 +553,42 @@ class TestEngine(unittest.TestCase):
@unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
def test_pandas_categorical(self):
import pandas as pd
np.random.seed(42) # sometimes there is no difference how E col is treated (cat or not cat)
X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75), # str
"B": np.random.permutation([1, 2, 3] * 100), # int
"C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float
"D": np.random.permutation([True, False] * 150)}) # bool
"D": np.random.permutation([True, False] * 150), # bool
"E": pd.Categorical(np.random.permutation(['z', 'y', 'x', 'w', 'v'] * 60),
ordered=True)}) # str and ordered categorical
y = np.random.permutation([0, 1] * 150)
X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),
X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20), # unseen category
"B": np.random.permutation([1, 3] * 30),
"C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
"D": np.random.permutation([True, False] * 30)})
cat_cols = []
for col in ["A", "B", "C", "D"]:
X[col] = X[col].astype('category')
X_test[col] = X_test[col].astype('category')
cat_cols.append(X[col].cat.categories.tolist())
"D": np.random.permutation([True, False] * 30),
"E": pd.Categorical(pd.np.random.permutation(['z', 'y'] * 30),
ordered=True)})
np.random.seed() # reset seed
cat_cols_actual = ["A", "B", "C", "D"]
cat_cols_to_store = cat_cols_actual + ["E"]
X[cat_cols_actual] = X[cat_cols_actual].astype('category')
X_test[cat_cols_actual] = X_test[cat_cols_actual].astype('category')
cat_values = [X[col].cat.categories.tolist() for col in cat_cols_to_store]
params = {
'objective': 'binary',
'metric': 'binary_logloss',
'verbose': -1
}
lgb_train = lgb.Dataset(X, y)
gbm0 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False)
gbm0 = lgb.train(params, lgb_train, num_boost_round=10)
pred0 = gbm0.predict(X_test)
lgb_train = lgb.Dataset(X, pd.DataFrame(y)) # also test that label can be one-column pd.DataFrame
gbm1 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
categorical_feature=[0])
gbm1 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=[0])
pred1 = gbm1.predict(X_test)
lgb_train = lgb.Dataset(X, pd.Series(y)) # also test that label can be pd.Series
gbm2 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
categorical_feature=['A'])
gbm2 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['A'])
pred2 = gbm2.predict(X_test)
lgb_train = lgb.Dataset(X, y)
gbm3 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
categorical_feature=['A', 'B', 'C', 'D'])
gbm3 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['A', 'B', 'C', 'D'])
pred3 = gbm3.predict(X_test)
gbm3.save_model('categorical.model')
gbm4 = lgb.Booster(model_file='categorical.model')
......@@ -595,18 +598,25 @@ class TestEngine(unittest.TestCase):
pred5 = gbm4.predict(X_test)
gbm5 = lgb.Booster(model_str=model_str)
pred6 = gbm5.predict(X_test)
lgb_train = lgb.Dataset(X, y)
gbm6 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['E'])
pred7 = gbm6.predict(X_test)
np.testing.assert_almost_equal(pred0, pred1)
np.testing.assert_almost_equal(pred0, pred2)
np.testing.assert_almost_equal(pred0, pred3)
np.testing.assert_almost_equal(pred0, pred4)
np.testing.assert_almost_equal(pred0, pred5)
np.testing.assert_almost_equal(pred0, pred6)
self.assertListEqual(gbm0.pandas_categorical, cat_cols)
self.assertListEqual(gbm1.pandas_categorical, cat_cols)
self.assertListEqual(gbm2.pandas_categorical, cat_cols)
self.assertListEqual(gbm3.pandas_categorical, cat_cols)
self.assertListEqual(gbm4.pandas_categorical, cat_cols)
self.assertListEqual(gbm5.pandas_categorical, cat_cols)
self.assertRaises(AssertionError,
np.testing.assert_almost_equal,
pred0, pred7) # ordered cat features aren't treated as cat features by default
self.assertListEqual(gbm0.pandas_categorical, cat_values)
self.assertListEqual(gbm1.pandas_categorical, cat_values)
self.assertListEqual(gbm2.pandas_categorical, cat_values)
self.assertListEqual(gbm3.pandas_categorical, cat_values)
self.assertListEqual(gbm4.pandas_categorical, cat_values)
self.assertListEqual(gbm5.pandas_categorical, cat_values)
self.assertListEqual(gbm6.pandas_categorical, cat_values)
def test_reference_chain(self):
X = np.random.normal(size=(100, 2))
......
......@@ -206,22 +206,29 @@ class TestSklearn(unittest.TestCase):
@unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
def test_pandas_categorical(self):
import pandas as pd
np.random.seed(42) # sometimes there is no difference how E col is treated (cat or not cat)
X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75), # str
"B": np.random.permutation([1, 2, 3] * 100), # int
"C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float
"D": np.random.permutation([True, False] * 150)}) # bool
"D": np.random.permutation([True, False] * 150), # bool
"E": pd.Categorical(np.random.permutation(['z', 'y', 'x', 'w', 'v'] * 60),
ordered=True)}) # str and ordered categorical
y = np.random.permutation([0, 1] * 150)
X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),
X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20), # unseen category
"B": np.random.permutation([1, 3] * 30),
"C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
"D": np.random.permutation([True, False] * 30)})
cat_cols = []
for col in ["A", "B", "C", "D"]:
X[col] = X[col].astype('category')
X_test[col] = X_test[col].astype('category')
cat_cols.append(X[col].cat.categories.tolist())
"D": np.random.permutation([True, False] * 30),
"E": pd.Categorical(pd.np.random.permutation(['z', 'y'] * 30),
ordered=True)})
np.random.seed() # reset seed
cat_cols_actual = ["A", "B", "C", "D"]
cat_cols_to_store = cat_cols_actual + ["E"]
X[cat_cols_actual] = X[cat_cols_actual].astype('category')
X_test[cat_cols_actual] = X_test[cat_cols_actual].astype('category')
cat_values = [X[col].cat.categories.tolist() for col in cat_cols_to_store]
gbm0 = lgb.sklearn.LGBMClassifier().fit(X, y)
pred0 = gbm0.predict(X_test)
pred_prob = gbm0.predict_proba(X_test)[:, 1]
gbm1 = lgb.sklearn.LGBMClassifier().fit(X, pd.Series(y), categorical_feature=[0])
pred1 = gbm1.predict(X_test)
gbm2 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A'])
......@@ -231,16 +238,21 @@ class TestSklearn(unittest.TestCase):
gbm3.booster_.save_model('categorical.model')
gbm4 = lgb.Booster(model_file='categorical.model')
pred4 = gbm4.predict(X_test)
pred_prob = gbm0.predict_proba(X_test)[:, 1]
gbm5 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['E'])
pred5 = gbm5.predict(X_test)
np.testing.assert_almost_equal(pred0, pred1)
np.testing.assert_almost_equal(pred0, pred2)
np.testing.assert_almost_equal(pred0, pred3)
np.testing.assert_almost_equal(pred_prob, pred4)
self.assertListEqual(gbm0.booster_.pandas_categorical, cat_cols)
self.assertListEqual(gbm1.booster_.pandas_categorical, cat_cols)
self.assertListEqual(gbm2.booster_.pandas_categorical, cat_cols)
self.assertListEqual(gbm3.booster_.pandas_categorical, cat_cols)
self.assertListEqual(gbm4.pandas_categorical, cat_cols)
self.assertRaises(AssertionError,
np.testing.assert_almost_equal,
pred0, pred5) # ordered cat features aren't treated as cat features by default
self.assertListEqual(gbm0.booster_.pandas_categorical, cat_values)
self.assertListEqual(gbm1.booster_.pandas_categorical, cat_values)
self.assertListEqual(gbm2.booster_.pandas_categorical, cat_values)
self.assertListEqual(gbm3.booster_.pandas_categorical, cat_values)
self.assertListEqual(gbm4.pandas_categorical, cat_values)
self.assertListEqual(gbm5.booster_.pandas_categorical, cat_values)
def test_predict(self):
iris = load_iris()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment