Unverified commit d115769c, authored by Nikita Titov and committed by GitHub
Browse files

[python] ignore pandas ordered categorical columns by default (#2115)

* ignore pandas ordered categorical columns by default

* fix tests

* fix tests

* added comments
parent 89f2021a
...@@ -258,7 +258,8 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica ...@@ -258,7 +258,8 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
raise ValueError('Input data must be 2 dimensional and non empty.') raise ValueError('Input data must be 2 dimensional and non empty.')
if feature_name == 'auto' or feature_name is None: if feature_name == 'auto' or feature_name is None:
data = data.rename(columns=str) data = data.rename(columns=str)
cat_cols = data.select_dtypes(include=['category']).columns cat_cols = list(data.select_dtypes(include=['category']).columns)
cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered]
if pandas_categorical is None: # train dataset if pandas_categorical is None: # train dataset
pandas_categorical = [list(data[col].cat.categories) for col in cat_cols] pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
else: else:
...@@ -267,26 +268,25 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica ...@@ -267,26 +268,25 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
for col, category in zip_(cat_cols, pandas_categorical): for col, category in zip_(cat_cols, pandas_categorical):
if list(data[col].cat.categories) != list(category): if list(data[col].cat.categories) != list(category):
data[col] = data[col].cat.set_categories(category) data[col] = data[col].cat.set_categories(category)
if len(cat_cols): # cat_cols is pandas Index object if len(cat_cols): # cat_cols is list
data = data.copy() # not alter origin DataFrame data = data.copy() # not alter origin DataFrame
data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan}) data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan})
if categorical_feature is not None: if categorical_feature is not None:
if feature_name is None: if feature_name is None:
feature_name = list(data.columns) feature_name = list(data.columns)
if categorical_feature == 'auto': if categorical_feature == 'auto':
categorical_feature = list(cat_cols) categorical_feature = cat_cols_not_ordered
else: else:
categorical_feature = list(categorical_feature) + list(cat_cols) categorical_feature = list(categorical_feature) + cat_cols_not_ordered
if feature_name == 'auto': if feature_name == 'auto':
feature_name = list(data.columns) feature_name = list(data.columns)
data_dtypes = data.dtypes data_dtypes = data.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes): if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
bad_fields = [data.columns[i] for i, dtype in bad_fields = [data.columns[i] for i, dtype in
enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER] enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER]
raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n"
msg = ("DataFrame.dtypes for data must be int, float or bool.\n" "Did not expect the data types in fields "
"Did not expect the data types in fields ") + ', '.join(bad_fields))
raise ValueError(msg + ', '.join(bad_fields))
data = data.values.astype('float') data = data.values.astype('float')
else: else:
if feature_name == 'auto': if feature_name == 'auto':
...@@ -686,7 +686,7 @@ class Dataset(object): ...@@ -686,7 +686,7 @@ class Dataset(object):
Categorical features. Categorical features.
If list of int, interpreted as indices. If list of int, interpreted as indices.
If list of strings, interpreted as feature names (need to specify ``feature_name`` as well). If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
If 'auto' and data is pandas DataFrame, pandas categorical columns are used. If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
All values in categorical features should be less than int32 max value (2147483647). All values in categorical features should be less than int32 max value (2147483647).
Large values could be memory consuming. Consider using consecutive integers starting from zero. Large values could be memory consuming. Consider using consecutive integers starting from zero.
All negative values in categorical features will be treated as missing values. All negative values in categorical features will be treated as missing values.
......
...@@ -56,7 +56,7 @@ def train(params, train_set, num_boost_round=100, ...@@ -56,7 +56,7 @@ def train(params, train_set, num_boost_round=100,
Categorical features. Categorical features.
If list of int, interpreted as indices. If list of int, interpreted as indices.
If list of strings, interpreted as feature names (need to specify ``feature_name`` as well). If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
If 'auto' and data is pandas DataFrame, pandas categorical columns are used. If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
All values in categorical features should be less than int32 max value (2147483647). All values in categorical features should be less than int32 max value (2147483647).
Large values could be memory consuming. Consider using consecutive integers starting from zero. Large values could be memory consuming. Consider using consecutive integers starting from zero.
All negative values in categorical features will be treated as missing values. All negative values in categorical features will be treated as missing values.
...@@ -391,7 +391,7 @@ def cv(params, train_set, num_boost_round=100, ...@@ -391,7 +391,7 @@ def cv(params, train_set, num_boost_round=100,
Categorical features. Categorical features.
If list of int, interpreted as indices. If list of int, interpreted as indices.
If list of strings, interpreted as feature names (need to specify ``feature_name`` as well). If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
If 'auto' and data is pandas DataFrame, pandas categorical columns are used. If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
All values in categorical features should be less than int32 max value (2147483647). All values in categorical features should be less than int32 max value (2147483647).
Large values could be memory consuming. Consider using consecutive integers starting from zero. Large values could be memory consuming. Consider using consecutive integers starting from zero.
All negative values in categorical features will be treated as missing values. All negative values in categorical features will be treated as missing values.
......
...@@ -392,7 +392,7 @@ class LGBMModel(_LGBMModelBase): ...@@ -392,7 +392,7 @@ class LGBMModel(_LGBMModelBase):
Categorical features. Categorical features.
If list of int, interpreted as indices. If list of int, interpreted as indices.
If list of strings, interpreted as feature names (need to specify ``feature_name`` as well). If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
If 'auto' and data is pandas DataFrame, pandas categorical columns are used. If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
All values in categorical features should be less than int32 max value (2147483647). All values in categorical features should be less than int32 max value (2147483647).
Large values could be memory consuming. Consider using consecutive integers starting from zero. Large values could be memory consuming. Consider using consecutive integers starting from zero.
All negative values in categorical features will be treated as missing values. All negative values in categorical features will be treated as missing values.
......
...@@ -553,39 +553,42 @@ class TestEngine(unittest.TestCase): ...@@ -553,39 +553,42 @@ class TestEngine(unittest.TestCase):
@unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed') @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
def test_pandas_categorical(self): def test_pandas_categorical(self):
import pandas as pd import pandas as pd
np.random.seed(42) # sometimes there is no difference how E col is treated (cat or not cat)
X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75), # str X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75), # str
"B": np.random.permutation([1, 2, 3] * 100), # int "B": np.random.permutation([1, 2, 3] * 100), # int
"C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float
"D": np.random.permutation([True, False] * 150)}) # bool "D": np.random.permutation([True, False] * 150), # bool
"E": pd.Categorical(np.random.permutation(['z', 'y', 'x', 'w', 'v'] * 60),
ordered=True)}) # str and ordered categorical
y = np.random.permutation([0, 1] * 150) y = np.random.permutation([0, 1] * 150)
X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20), X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20), # unseen category
"B": np.random.permutation([1, 3] * 30), "B": np.random.permutation([1, 3] * 30),
"C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15), "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
"D": np.random.permutation([True, False] * 30)}) "D": np.random.permutation([True, False] * 30),
cat_cols = [] "E": pd.Categorical(pd.np.random.permutation(['z', 'y'] * 30),
for col in ["A", "B", "C", "D"]: ordered=True)})
X[col] = X[col].astype('category') np.random.seed() # reset seed
X_test[col] = X_test[col].astype('category') cat_cols_actual = ["A", "B", "C", "D"]
cat_cols.append(X[col].cat.categories.tolist()) cat_cols_to_store = cat_cols_actual + ["E"]
X[cat_cols_actual] = X[cat_cols_actual].astype('category')
X_test[cat_cols_actual] = X_test[cat_cols_actual].astype('category')
cat_values = [X[col].cat.categories.tolist() for col in cat_cols_to_store]
params = { params = {
'objective': 'binary', 'objective': 'binary',
'metric': 'binary_logloss', 'metric': 'binary_logloss',
'verbose': -1 'verbose': -1
} }
lgb_train = lgb.Dataset(X, y) lgb_train = lgb.Dataset(X, y)
gbm0 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False) gbm0 = lgb.train(params, lgb_train, num_boost_round=10)
pred0 = gbm0.predict(X_test) pred0 = gbm0.predict(X_test)
lgb_train = lgb.Dataset(X, pd.DataFrame(y)) # also test that label can be one-column pd.DataFrame lgb_train = lgb.Dataset(X, pd.DataFrame(y)) # also test that label can be one-column pd.DataFrame
gbm1 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False, gbm1 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=[0])
categorical_feature=[0])
pred1 = gbm1.predict(X_test) pred1 = gbm1.predict(X_test)
lgb_train = lgb.Dataset(X, pd.Series(y)) # also test that label can be pd.Series lgb_train = lgb.Dataset(X, pd.Series(y)) # also test that label can be pd.Series
gbm2 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False, gbm2 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['A'])
categorical_feature=['A'])
pred2 = gbm2.predict(X_test) pred2 = gbm2.predict(X_test)
lgb_train = lgb.Dataset(X, y) lgb_train = lgb.Dataset(X, y)
gbm3 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False, gbm3 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['A', 'B', 'C', 'D'])
categorical_feature=['A', 'B', 'C', 'D'])
pred3 = gbm3.predict(X_test) pred3 = gbm3.predict(X_test)
gbm3.save_model('categorical.model') gbm3.save_model('categorical.model')
gbm4 = lgb.Booster(model_file='categorical.model') gbm4 = lgb.Booster(model_file='categorical.model')
...@@ -595,18 +598,25 @@ class TestEngine(unittest.TestCase): ...@@ -595,18 +598,25 @@ class TestEngine(unittest.TestCase):
pred5 = gbm4.predict(X_test) pred5 = gbm4.predict(X_test)
gbm5 = lgb.Booster(model_str=model_str) gbm5 = lgb.Booster(model_str=model_str)
pred6 = gbm5.predict(X_test) pred6 = gbm5.predict(X_test)
lgb_train = lgb.Dataset(X, y)
gbm6 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['E'])
pred7 = gbm6.predict(X_test)
np.testing.assert_almost_equal(pred0, pred1) np.testing.assert_almost_equal(pred0, pred1)
np.testing.assert_almost_equal(pred0, pred2) np.testing.assert_almost_equal(pred0, pred2)
np.testing.assert_almost_equal(pred0, pred3) np.testing.assert_almost_equal(pred0, pred3)
np.testing.assert_almost_equal(pred0, pred4) np.testing.assert_almost_equal(pred0, pred4)
np.testing.assert_almost_equal(pred0, pred5) np.testing.assert_almost_equal(pred0, pred5)
np.testing.assert_almost_equal(pred0, pred6) np.testing.assert_almost_equal(pred0, pred6)
self.assertListEqual(gbm0.pandas_categorical, cat_cols) self.assertRaises(AssertionError,
self.assertListEqual(gbm1.pandas_categorical, cat_cols) np.testing.assert_almost_equal,
self.assertListEqual(gbm2.pandas_categorical, cat_cols) pred0, pred7) # ordered cat features aren't treated as cat features by default
self.assertListEqual(gbm3.pandas_categorical, cat_cols) self.assertListEqual(gbm0.pandas_categorical, cat_values)
self.assertListEqual(gbm4.pandas_categorical, cat_cols) self.assertListEqual(gbm1.pandas_categorical, cat_values)
self.assertListEqual(gbm5.pandas_categorical, cat_cols) self.assertListEqual(gbm2.pandas_categorical, cat_values)
self.assertListEqual(gbm3.pandas_categorical, cat_values)
self.assertListEqual(gbm4.pandas_categorical, cat_values)
self.assertListEqual(gbm5.pandas_categorical, cat_values)
self.assertListEqual(gbm6.pandas_categorical, cat_values)
def test_reference_chain(self): def test_reference_chain(self):
X = np.random.normal(size=(100, 2)) X = np.random.normal(size=(100, 2))
......
...@@ -206,22 +206,29 @@ class TestSklearn(unittest.TestCase): ...@@ -206,22 +206,29 @@ class TestSklearn(unittest.TestCase):
@unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed') @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
def test_pandas_categorical(self): def test_pandas_categorical(self):
import pandas as pd import pandas as pd
np.random.seed(42) # sometimes there is no difference how E col is treated (cat or not cat)
X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75), # str X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75), # str
"B": np.random.permutation([1, 2, 3] * 100), # int "B": np.random.permutation([1, 2, 3] * 100), # int
"C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float
"D": np.random.permutation([True, False] * 150)}) # bool "D": np.random.permutation([True, False] * 150), # bool
"E": pd.Categorical(np.random.permutation(['z', 'y', 'x', 'w', 'v'] * 60),
ordered=True)}) # str and ordered categorical
y = np.random.permutation([0, 1] * 150) y = np.random.permutation([0, 1] * 150)
X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20), X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20), # unseen category
"B": np.random.permutation([1, 3] * 30), "B": np.random.permutation([1, 3] * 30),
"C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15), "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
"D": np.random.permutation([True, False] * 30)}) "D": np.random.permutation([True, False] * 30),
cat_cols = [] "E": pd.Categorical(pd.np.random.permutation(['z', 'y'] * 30),
for col in ["A", "B", "C", "D"]: ordered=True)})
X[col] = X[col].astype('category') np.random.seed() # reset seed
X_test[col] = X_test[col].astype('category') cat_cols_actual = ["A", "B", "C", "D"]
cat_cols.append(X[col].cat.categories.tolist()) cat_cols_to_store = cat_cols_actual + ["E"]
X[cat_cols_actual] = X[cat_cols_actual].astype('category')
X_test[cat_cols_actual] = X_test[cat_cols_actual].astype('category')
cat_values = [X[col].cat.categories.tolist() for col in cat_cols_to_store]
gbm0 = lgb.sklearn.LGBMClassifier().fit(X, y) gbm0 = lgb.sklearn.LGBMClassifier().fit(X, y)
pred0 = gbm0.predict(X_test) pred0 = gbm0.predict(X_test)
pred_prob = gbm0.predict_proba(X_test)[:, 1]
gbm1 = lgb.sklearn.LGBMClassifier().fit(X, pd.Series(y), categorical_feature=[0]) gbm1 = lgb.sklearn.LGBMClassifier().fit(X, pd.Series(y), categorical_feature=[0])
pred1 = gbm1.predict(X_test) pred1 = gbm1.predict(X_test)
gbm2 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A']) gbm2 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A'])
...@@ -231,16 +238,21 @@ class TestSklearn(unittest.TestCase): ...@@ -231,16 +238,21 @@ class TestSklearn(unittest.TestCase):
gbm3.booster_.save_model('categorical.model') gbm3.booster_.save_model('categorical.model')
gbm4 = lgb.Booster(model_file='categorical.model') gbm4 = lgb.Booster(model_file='categorical.model')
pred4 = gbm4.predict(X_test) pred4 = gbm4.predict(X_test)
pred_prob = gbm0.predict_proba(X_test)[:, 1] gbm5 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['E'])
pred5 = gbm5.predict(X_test)
np.testing.assert_almost_equal(pred0, pred1) np.testing.assert_almost_equal(pred0, pred1)
np.testing.assert_almost_equal(pred0, pred2) np.testing.assert_almost_equal(pred0, pred2)
np.testing.assert_almost_equal(pred0, pred3) np.testing.assert_almost_equal(pred0, pred3)
np.testing.assert_almost_equal(pred_prob, pred4) np.testing.assert_almost_equal(pred_prob, pred4)
self.assertListEqual(gbm0.booster_.pandas_categorical, cat_cols) self.assertRaises(AssertionError,
self.assertListEqual(gbm1.booster_.pandas_categorical, cat_cols) np.testing.assert_almost_equal,
self.assertListEqual(gbm2.booster_.pandas_categorical, cat_cols) pred0, pred5) # ordered cat features aren't treated as cat features by default
self.assertListEqual(gbm3.booster_.pandas_categorical, cat_cols) self.assertListEqual(gbm0.booster_.pandas_categorical, cat_values)
self.assertListEqual(gbm4.pandas_categorical, cat_cols) self.assertListEqual(gbm1.booster_.pandas_categorical, cat_values)
self.assertListEqual(gbm2.booster_.pandas_categorical, cat_values)
self.assertListEqual(gbm3.booster_.pandas_categorical, cat_values)
self.assertListEqual(gbm4.pandas_categorical, cat_values)
self.assertListEqual(gbm5.booster_.pandas_categorical, cat_values)
def test_predict(self): def test_predict(self):
iris = load_iris() iris = load_iris()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment