Unverified Commit ee511201 authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

[python-package] simplify processing of pandas data (#6066)

parent 82033064
...@@ -668,57 +668,52 @@ def _check_for_bad_pandas_dtypes(pandas_dtypes_series: pd_Series) -> None: ...@@ -668,57 +668,52 @@ def _check_for_bad_pandas_dtypes(pandas_dtypes_series: pd_Series) -> None:
def _data_from_pandas(
    data: pd_DataFrame,
    feature_name: _LGBM_FeatureNameConfiguration,
    categorical_feature: _LGBM_CategoricalFeatureConfiguration,
    pandas_categorical: Optional[List[List]]
) -> Tuple[np.ndarray, List[str], List[str], List[List]]:
    """Convert a pandas DataFrame into the numpy representation LightGBM consumes.

    Parameters
    ----------
    data : pd_DataFrame
        Two-dimensional, non-empty DataFrame of features.
    feature_name : list of str, or 'auto'
        Feature names, or ``'auto'`` to derive them from the DataFrame columns.
    categorical_feature : list of str or int, or 'auto'
        Categorical features, or ``'auto'`` to use the DataFrame's unordered
        categorical columns.
    pandas_categorical : list of list, or None
        Category levels recorded at training time; ``None`` means this is the
        training dataset and the levels should be captured from ``data``.

    Returns
    -------
    tuple
        ``(data, feature_name, categorical_feature, pandas_categorical)`` where
        ``data`` is a 2-D numpy array.

    Raises
    ------
    ValueError
        If ``data`` is not 2-dimensional and non-empty, or if the categorical
        columns do not match ``pandas_categorical`` from the training dataset.
    """
    if len(data.shape) != 2 or data.shape[0] < 1:
        raise ValueError('Input data must be 2 dimensional and non empty.')

    # determine feature names
    if feature_name == 'auto':
        feature_name = [str(col) for col in data.columns]

    # determine categorical features
    cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pd_CategoricalDtype)]
    cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered]
    if pandas_categorical is None:  # train dataset
        pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
    else:
        if len(cat_cols) != len(pandas_categorical):
            raise ValueError('train and valid dataset categorical_feature do not match.')
        for col, category in zip(cat_cols, pandas_categorical):
            # align validation categories with the levels seen during training
            if list(data[col].cat.categories) != list(category):
                data[col] = data[col].cat.set_categories(category)
    if len(cat_cols):  # cat_cols is list
        data = data.copy(deep=False)  # not alter origin DataFrame
        # replace categories with their integer codes; -1 (missing) becomes NaN
        data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan})
    if categorical_feature == 'auto':  # use cat cols from DataFrame
        categorical_feature = cat_cols_not_ordered
    else:  # use cat cols specified by user
        categorical_feature = list(categorical_feature)  # type: ignore[assignment]

    # get numpy representation of the data
    _check_for_bad_pandas_dtypes(data.dtypes)
    df_dtypes = [dtype.type for dtype in data.dtypes]
    df_dtypes.append(np.float32)  # so that the target dtype considers floats
    target_dtype = np.result_type(*df_dtypes)
    try:
        # most common case (no nullable dtypes)
        data = data.to_numpy(dtype=target_dtype, copy=False)
    except TypeError:
        # 1.0 <= pd version < 1.1 and nullable dtypes, least common case
        # raises error because array is casted to type(pd.NA) and there's no na_value argument
        data = data.astype(target_dtype, copy=False).values
    except ValueError:
        # data has nullable dtypes, but we can specify na_value argument and copy will be made
        data = data.to_numpy(dtype=target_dtype, na_value=np.nan)
    return data, feature_name, categorical_feature, pandas_categorical
...@@ -1004,7 +999,15 @@ class _InnerPredictor: ...@@ -1004,7 +999,15 @@ class _InnerPredictor:
ctypes.c_int(len(data_names)), ctypes.c_int(len(data_names)),
) )
) )
data = _data_from_pandas(data, None, None, self.pandas_categorical)[0]
if isinstance(data, pd_DataFrame):
data = _data_from_pandas(
data=data,
feature_name="auto",
categorical_feature="auto",
pandas_categorical=self.pandas_categorical
)[0]
predict_type = _C_API_PREDICT_NORMAL predict_type = _C_API_PREDICT_NORMAL
if raw_score: if raw_score:
predict_type = _C_API_PREDICT_RAW_SCORE predict_type = _C_API_PREDICT_RAW_SCORE
...@@ -1854,10 +1857,13 @@ class Dataset: ...@@ -1854,10 +1857,13 @@ class Dataset:
if reference is not None: if reference is not None:
self.pandas_categorical = reference.pandas_categorical self.pandas_categorical = reference.pandas_categorical
categorical_feature = reference.categorical_feature categorical_feature = reference.categorical_feature
data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data=data, if isinstance(data, pd_DataFrame):
feature_name=feature_name, data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(
categorical_feature=categorical_feature, data=data,
pandas_categorical=self.pandas_categorical) feature_name=feature_name,
categorical_feature=categorical_feature,
pandas_categorical=self.pandas_categorical
)
# process for args # process for args
params = {} if params is None else params params = {} if params is None else params
...@@ -1867,10 +1873,10 @@ class Dataset: ...@@ -1867,10 +1873,10 @@ class Dataset:
_log_warning(f'{key} keyword has been found in `params` and will be ignored.\n' _log_warning(f'{key} keyword has been found in `params` and will be ignored.\n'
f'Please use {key} argument of the Dataset constructor to pass this parameter.') f'Please use {key} argument of the Dataset constructor to pass this parameter.')
# get categorical features # get categorical features
if categorical_feature is not None: if isinstance(categorical_feature, list):
categorical_indices = set() categorical_indices = set()
feature_dict = {} feature_dict = {}
if feature_name is not None: if isinstance(feature_name, list):
feature_dict = {name: i for i, name in enumerate(feature_name)} feature_dict = {name: i for i, name in enumerate(feature_name)}
for name in categorical_feature: for name in categorical_feature:
if isinstance(name, str) and name in feature_dict: if isinstance(name, str) and name in feature_dict:
......
...@@ -712,8 +712,8 @@ def create_tree_digraph( ...@@ -712,8 +712,8 @@ def create_tree_digraph(
if isinstance(example_case, pd_DataFrame): if isinstance(example_case, pd_DataFrame):
example_case = _data_from_pandas( example_case = _data_from_pandas(
data=example_case, data=example_case,
feature_name=None, feature_name="auto",
categorical_feature=None, categorical_feature="auto",
pandas_categorical=booster.pandas_categorical pandas_categorical=booster.pandas_categorical
)[0] )[0]
example_case = example_case[0] example_case = example_case[0]
......
...@@ -723,7 +723,12 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name): ...@@ -723,7 +723,12 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name):
pd = pytest.importorskip('pandas') pd = pytest.importorskip('pandas')
X = np.random.rand(10, 2).astype(dtype) X = np.random.rand(10, 2).astype(dtype)
df = pd.DataFrame(X) df = pd.DataFrame(X)
built_data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0] built_data = lgb.basic._data_from_pandas(
data=df,
feature_name=feature_name,
categorical_feature="auto",
pandas_categorical=None
)[0]
assert built_data.dtype == dtype assert built_data.dtype == dtype
assert np.shares_memory(X, built_data) assert np.shares_memory(X, built_data)
...@@ -734,7 +739,12 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name): ...@@ -734,7 +739,12 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name):
X = np.random.choice(['a', 'b'], 100).reshape(-1, 1) X = np.random.choice(['a', 'b'], 100).reshape(-1, 1)
column_name = 'a' if feature_name == 'auto' else feature_name[0] column_name = 'a' if feature_name == 'auto' else feature_name[0]
df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category') df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category')
data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0] data = lgb.basic._data_from_pandas(
data=df,
feature_name=feature_name,
categorical_feature="auto",
pandas_categorical=None
)[0]
# check that the original data wasn't modified # check that the original data wasn't modified
np.testing.assert_equal(df[column_name], X[:, 0]) np.testing.assert_equal(df[column_name], X[:, 0])
# check that the built data has the codes # check that the built data has the codes
...@@ -806,3 +816,10 @@ def test_set_leaf_output(): ...@@ -806,3 +816,10 @@ def test_set_leaf_output():
leaf_output = bst.get_leaf_output(tree_id=0, leaf_id=leaf_id) leaf_output = bst.get_leaf_output(tree_id=0, leaf_id=leaf_id)
bst.set_leaf_output(tree_id=0, leaf_id=leaf_id, value=leaf_output + 1) bst.set_leaf_output(tree_id=0, leaf_id=leaf_id, value=leaf_output + 1)
np.testing.assert_allclose(bst.predict(X), y_pred + 1) np.testing.assert_allclose(bst.predict(X), y_pred + 1)
def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Dataset():
    # Without explicit names, constructed Datasets default to "Column_<i>".
    num_features = 3
    ds = lgb.Dataset(data=np.random.randn(100, num_features))
    expected_names = [f"Column_{i}" for i in range(num_features)]
    assert ds.construct().feature_name == expected_names
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment