[python] ignore pandas ordered categorical columns by default (#2115)

* ignore pandas ordered categorical columns by default
* fix tests
* fix tests
* added comments

[python] ignore pandas ordered categorical columns by default (#2115)
* ignore pandas ordered categorical columns by default
* fix tests
* fix tests
* added comments
d115769c · Nikita Titov · GitHub · 89f2021a · d115769c · d115769c
Unverified Commit d115769c authored Apr 19, 2019 by Nikita Titov Committed by GitHub Apr 19, 2019
5 changed files
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -258,7 +258,8 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
            raise ValueError('Input data must be 2 dimensional and non empty.')
        if feature_name == 'auto' or feature_name is None:
            data = data.rename(columns=str)
-        cat_cols = data.select_dtypes(include=['category']).columns
+        cat_cols = list(data.select_dtypes(include=['category']).columns)
+        cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered]
        if pandas_categorical is None:  # train dataset
            pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
        else:
@@ -267,26 +268,25 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
            for col, category in zip_(cat_cols, pandas_categorical):
                if list(data[col].cat.categories) != list(category):
                    data[col] = data[col].cat.set_categories(category)
-        if len(cat_cols):  # cat_cols is pandas Index object
+        if len(cat_cols):  # cat_cols is list
            data = data.copy()  # not alter origin DataFrame
            data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan})
        if categorical_feature is not None:
            if feature_name is None:
                feature_name = list(data.columns)
            if categorical_feature == 'auto':
-                categorical_feature = list(cat_cols)
+                categorical_feature = cat_cols_not_ordered
            else:
-                categorical_feature = list(categorical_feature) + list(cat_cols)
+                categorical_feature = list(categorical_feature) + cat_cols_not_ordered
        if feature_name == 'auto':
            feature_name = list(data.columns)
        data_dtypes = data.dtypes
        if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
            bad_fields = [data.columns[i] for i, dtype in
                          enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER]
+            raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n"
-            msg = ("DataFrame.dtypes for data must be int, float or bool.\n"
+                             "Did not expect the data types in fields "
-                   "Did not expect the data types in fields ")
+                             + ', '.join(bad_fields))
-            raise ValueError(msg + ', '.join(bad_fields))
        data = data.values.astype('float')
    else:
        if feature_name == 'auto':
@@ -686,7 +686,7 @@ class Dataset(object):
            Categorical features.
            If list of int, interpreted as indices.
            If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
-            If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
+            If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
            All values in categorical features should be less than int32 max value (2147483647).
            Large values could be memory consuming. Consider using consecutive integers starting from zero.
            All negative values in categorical features will be treated as missing values.

--- a/python-package/lightgbm/engine.py
+++ b/python-package/lightgbm/engine.py
@@ -56,7 +56,7 @@ def train(params, train_set, num_boost_round=100,
        Categorical features.
        If list of int, interpreted as indices.
        If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
-        If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
+        If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
        All values in categorical features should be less than int32 max value (2147483647).
        Large values could be memory consuming. Consider using consecutive integers starting from zero.
        All negative values in categorical features will be treated as missing values.
@@ -391,7 +391,7 @@ def cv(params, train_set, num_boost_round=100,
        Categorical features.
        If list of int, interpreted as indices.
        If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
-        If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
+        If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
        All values in categorical features should be less than int32 max value (2147483647).
        Large values could be memory consuming. Consider using consecutive integers starting from zero.
        All negative values in categorical features will be treated as missing values.

--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -392,7 +392,7 @@ class LGBMModel(_LGBMModelBase):
            Categorical features.
            If list of int, interpreted as indices.
            If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
-            If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
+            If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
            All values in categorical features should be less than int32 max value (2147483647).
            Large values could be memory consuming. Consider using consecutive integers starting from zero.
            All negative values in categorical features will be treated as missing values.

--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -553,39 +553,42 @@ class TestEngine(unittest.TestCase):
    @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
    def test_pandas_categorical(self):
        import pandas as pd
+        np.random.seed(42)  # sometimes there is no difference how E col is treated (cat or not cat)
        X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75),  # str
                          "B": np.random.permutation([1, 2, 3] * 100),  # int
                          "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60),  # float
-                          "D": np.random.permutation([True, False] * 150)})  # bool
+                          "D": np.random.permutation([True, False] * 150),  # bool
+                          "E": pd.Categorical(np.random.permutation(['z', 'y', 'x', 'w', 'v'] * 60),
+                                              ordered=True)})  # str and ordered categorical
        y = np.random.permutation([0, 1] * 150)
-        X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),
+        X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),  # unseen category
                               "B": np.random.permutation([1, 3] * 30),
                               "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
-                               "D": np.random.permutation([True, False] * 30)})
+                               "D": np.random.permutation([True, False] * 30),
-        cat_cols = []
+                               "E": pd.Categorical(pd.np.random.permutation(['z', 'y'] * 30),
-        for col in ["A", "B", "C", "D"]:
+                                                   ordered=True)})
-            X[col] = X[col].astype('category')
+        np.random.seed()  # reset seed
-            X_test[col] = X_test[col].astype('category')
+        cat_cols_actual = ["A", "B", "C", "D"]
-            cat_cols.append(X[col].cat.categories.tolist())
+        cat_cols_to_store = cat_cols_actual + ["E"]
+        X[cat_cols_actual] = X[cat_cols_actual].astype('category')
+        X_test[cat_cols_actual] = X_test[cat_cols_actual].astype('category')
+        cat_values = [X[col].cat.categories.tolist() for col in cat_cols_to_store]
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'verbose': -1
        }
        lgb_train = lgb.Dataset(X, y)
-        gbm0 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False)
+        gbm0 = lgb.train(params, lgb_train, num_boost_round=10)
        pred0 = gbm0.predict(X_test)
        lgb_train = lgb.Dataset(X, pd.DataFrame(y))  # also test that label can be one-column pd.DataFrame
-        gbm1 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
+        gbm1 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=[0])
-                         categorical_feature=[0])
        pred1 = gbm1.predict(X_test)
        lgb_train = lgb.Dataset(X, pd.Series(y))  # also test that label can be pd.Series
-        gbm2 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
+        gbm2 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['A'])
-                         categorical_feature=['A'])
        pred2 = gbm2.predict(X_test)
        lgb_train = lgb.Dataset(X, y)
-        gbm3 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
+        gbm3 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['A', 'B', 'C', 'D'])
-                         categorical_feature=['A', 'B', 'C', 'D'])
        pred3 = gbm3.predict(X_test)
        gbm3.save_model('categorical.model')
        gbm4 = lgb.Booster(model_file='categorical.model')
@@ -595,18 +598,25 @@ class TestEngine(unittest.TestCase):
        pred5 = gbm4.predict(X_test)
        gbm5 = lgb.Booster(model_str=model_str)
        pred6 = gbm5.predict(X_test)
+        lgb_train = lgb.Dataset(X, y)
+        gbm6 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['E'])
+        pred7 = gbm6.predict(X_test)
        np.testing.assert_almost_equal(pred0, pred1)
        np.testing.assert_almost_equal(pred0, pred2)
        np.testing.assert_almost_equal(pred0, pred3)
        np.testing.assert_almost_equal(pred0, pred4)
        np.testing.assert_almost_equal(pred0, pred5)
        np.testing.assert_almost_equal(pred0, pred6)
-        self.assertListEqual(gbm0.pandas_categorical, cat_cols)
+        self.assertRaises(AssertionError,
-        self.assertListEqual(gbm1.pandas_categorical, cat_cols)
+                          np.testing.assert_almost_equal,
-        self.assertListEqual(gbm2.pandas_categorical, cat_cols)
+                          pred0, pred7)  # ordered cat features aren't treated as cat features by default
-        self.assertListEqual(gbm3.pandas_categorical, cat_cols)
+        self.assertListEqual(gbm0.pandas_categorical, cat_values)
-        self.assertListEqual(gbm4.pandas_categorical, cat_cols)
+        self.assertListEqual(gbm1.pandas_categorical, cat_values)
-        self.assertListEqual(gbm5.pandas_categorical, cat_cols)
+        self.assertListEqual(gbm2.pandas_categorical, cat_values)
+        self.assertListEqual(gbm3.pandas_categorical, cat_values)
+        self.assertListEqual(gbm4.pandas_categorical, cat_values)
+        self.assertListEqual(gbm5.pandas_categorical, cat_values)
+        self.assertListEqual(gbm6.pandas_categorical, cat_values)
    def test_reference_chain(self):
        X = np.random.normal(size=(100, 2))

--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -206,22 +206,29 @@ class TestSklearn(unittest.TestCase):
    @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
    def test_pandas_categorical(self):
        import pandas as pd
+        np.random.seed(42)  # sometimes there is no difference how E col is treated (cat or not cat)
        X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75),  # str
                          "B": np.random.permutation([1, 2, 3] * 100),  # int
                          "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60),  # float
-                          "D": np.random.permutation([True, False] * 150)})  # bool
+                          "D": np.random.permutation([True, False] * 150),  # bool
+                          "E": pd.Categorical(np.random.permutation(['z', 'y', 'x', 'w', 'v'] * 60),
+                                              ordered=True)})  # str and ordered categorical
        y = np.random.permutation([0, 1] * 150)
-        X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),
+        X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),  # unseen category
                               "B": np.random.permutation([1, 3] * 30),
                               "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
-                               "D": np.random.permutation([True, False] * 30)})
+                               "D": np.random.permutation([True, False] * 30),
-        cat_cols = []
+                               "E": pd.Categorical(pd.np.random.permutation(['z', 'y'] * 30),
-        for col in ["A", "B", "C", "D"]:
+                                                   ordered=True)})
-            X[col] = X[col].astype('category')
+        np.random.seed()  # reset seed
-            X_test[col] = X_test[col].astype('category')
+        cat_cols_actual = ["A", "B", "C", "D"]
-            cat_cols.append(X[col].cat.categories.tolist())
+        cat_cols_to_store = cat_cols_actual + ["E"]
+        X[cat_cols_actual] = X[cat_cols_actual].astype('category')
+        X_test[cat_cols_actual] = X_test[cat_cols_actual].astype('category')
+        cat_values = [X[col].cat.categories.tolist() for col in cat_cols_to_store]
        gbm0 = lgb.sklearn.LGBMClassifier().fit(X, y)
        pred0 = gbm0.predict(X_test)
+        pred_prob = gbm0.predict_proba(X_test)[:, 1]
        gbm1 = lgb.sklearn.LGBMClassifier().fit(X, pd.Series(y), categorical_feature=[0])
        pred1 = gbm1.predict(X_test)
        gbm2 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A'])
@@ -231,16 +238,21 @@ class TestSklearn(unittest.TestCase):
        gbm3.booster_.save_model('categorical.model')
        gbm4 = lgb.Booster(model_file='categorical.model')
        pred4 = gbm4.predict(X_test)
-        pred_prob = gbm0.predict_proba(X_test)[:, 1]
+        gbm5 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['E'])
+        pred5 = gbm5.predict(X_test)
        np.testing.assert_almost_equal(pred0, pred1)
        np.testing.assert_almost_equal(pred0, pred2)
        np.testing.assert_almost_equal(pred0, pred3)
        np.testing.assert_almost_equal(pred_prob, pred4)
-        self.assertListEqual(gbm0.booster_.pandas_categorical, cat_cols)
+        self.assertRaises(AssertionError,
-        self.assertListEqual(gbm1.booster_.pandas_categorical, cat_cols)
+                          np.testing.assert_almost_equal,
-        self.assertListEqual(gbm2.booster_.pandas_categorical, cat_cols)
+                          pred0, pred5)  # ordered cat features aren't treated as cat features by default
-        self.assertListEqual(gbm3.booster_.pandas_categorical, cat_cols)
+        self.assertListEqual(gbm0.booster_.pandas_categorical, cat_values)
-        self.assertListEqual(gbm4.pandas_categorical, cat_cols)
+        self.assertListEqual(gbm1.booster_.pandas_categorical, cat_values)
+        self.assertListEqual(gbm2.booster_.pandas_categorical, cat_values)
+        self.assertListEqual(gbm3.booster_.pandas_categorical, cat_values)
+        self.assertListEqual(gbm4.pandas_categorical, cat_values)
+        self.assertListEqual(gbm5.booster_.pandas_categorical, cat_values)
    def test_predict(self):
        iris = load_iris()