Unverified Commit ee511201 authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

[python-package] simplify processing of pandas data (#6066)

parent 82033064
...@@ -668,57 +668,52 @@ def _check_for_bad_pandas_dtypes(pandas_dtypes_series: pd_Series) -> None: ...@@ -668,57 +668,52 @@ def _check_for_bad_pandas_dtypes(pandas_dtypes_series: pd_Series) -> None:
def _data_from_pandas(
    data: pd_DataFrame,
    feature_name: _LGBM_FeatureNameConfiguration,
    categorical_feature: _LGBM_CategoricalFeatureConfiguration,
    pandas_categorical: Optional[List[List]]
) -> Tuple[np.ndarray, List[str], List[str], List[List]]:
    """Convert a pandas DataFrame into the numpy representation LightGBM consumes.

    Parameters
    ----------
    data : pd_DataFrame
        Two-dimensional, non-empty DataFrame of features.
    feature_name : list of str, or 'auto'
        Feature names, or ``'auto'`` to derive them from the DataFrame columns.
    categorical_feature : list of str or int, or 'auto'
        Categorical features, or ``'auto'`` to use the DataFrame's unordered
        categorical columns.
    pandas_categorical : list of list, or None
        Category levels recorded at training time; ``None`` means this is the
        training dataset and the levels should be captured from ``data``.

    Returns
    -------
    tuple
        ``(data, feature_name, categorical_feature, pandas_categorical)`` where
        ``data`` is a 2-D numpy array.

    Raises
    ------
    ValueError
        If ``data`` is not 2-dimensional and non-empty, or if the categorical
        columns do not match ``pandas_categorical`` from the training dataset.
    """
    if len(data.shape) != 2 or data.shape[0] < 1:
        raise ValueError('Input data must be 2 dimensional and non empty.')

    # determine feature names
    if feature_name == 'auto':
        feature_name = [str(col) for col in data.columns]

    # determine categorical features
    cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pd_CategoricalDtype)]
    cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered]
    if pandas_categorical is None:  # train dataset
        pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
    else:
        if len(cat_cols) != len(pandas_categorical):
            raise ValueError('train and valid dataset categorical_feature do not match.')
        for col, category in zip(cat_cols, pandas_categorical):
            # align validation categories with the levels seen during training
            if list(data[col].cat.categories) != list(category):
                data[col] = data[col].cat.set_categories(category)
    if len(cat_cols):  # cat_cols is list
        data = data.copy(deep=False)  # not alter origin DataFrame
        # replace categories with their integer codes; -1 (missing) becomes NaN
        data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan})
    if categorical_feature == 'auto':  # use cat cols from DataFrame
        categorical_feature = cat_cols_not_ordered
    else:  # use cat cols specified by user
        categorical_feature = list(categorical_feature)  # type: ignore[assignment]

    # get numpy representation of the data
    _check_for_bad_pandas_dtypes(data.dtypes)
    df_dtypes = [dtype.type for dtype in data.dtypes]
    df_dtypes.append(np.float32)  # so that the target dtype considers floats
    target_dtype = np.result_type(*df_dtypes)
    try:
        # most common case (no nullable dtypes)
        data = data.to_numpy(dtype=target_dtype, copy=False)
    except TypeError:
        # 1.0 <= pd version < 1.1 and nullable dtypes, least common case
        # raises error because array is casted to type(pd.NA) and there's no na_value argument
        data = data.astype(target_dtype, copy=False).values
    except ValueError:
        # data has nullable dtypes, but we can specify na_value argument and copy will be made
        data = data.to_numpy(dtype=target_dtype, na_value=np.nan)
    return data, feature_name, categorical_feature, pandas_categorical
...@@ -1004,7 +999,15 @@ class _InnerPredictor: ...@@ -1004,7 +999,15 @@ class _InnerPredictor:
ctypes.c_int(len(data_names)), ctypes.c_int(len(data_names)),
) )
) )
data = _data_from_pandas(data, None, None, self.pandas_categorical)[0]
if isinstance(data, pd_DataFrame):
data = _data_from_pandas(
data=data,
feature_name="auto",
categorical_feature="auto",
pandas_categorical=self.pandas_categorical
)[0]
predict_type = _C_API_PREDICT_NORMAL predict_type = _C_API_PREDICT_NORMAL
if raw_score: if raw_score:
predict_type = _C_API_PREDICT_RAW_SCORE predict_type = _C_API_PREDICT_RAW_SCORE
...@@ -1854,10 +1857,13 @@ class Dataset: ...@@ -1854,10 +1857,13 @@ class Dataset:
if reference is not None: if reference is not None:
self.pandas_categorical = reference.pandas_categorical self.pandas_categorical = reference.pandas_categorical
categorical_feature = reference.categorical_feature categorical_feature = reference.categorical_feature
data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data=data, if isinstance(data, pd_DataFrame):
feature_name=feature_name, data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(
categorical_feature=categorical_feature, data=data,
pandas_categorical=self.pandas_categorical) feature_name=feature_name,
categorical_feature=categorical_feature,
pandas_categorical=self.pandas_categorical
)
# process for args # process for args
params = {} if params is None else params params = {} if params is None else params
...@@ -1867,10 +1873,10 @@ class Dataset: ...@@ -1867,10 +1873,10 @@ class Dataset:
_log_warning(f'{key} keyword has been found in `params` and will be ignored.\n' _log_warning(f'{key} keyword has been found in `params` and will be ignored.\n'
f'Please use {key} argument of the Dataset constructor to pass this parameter.') f'Please use {key} argument of the Dataset constructor to pass this parameter.')
# get categorical features # get categorical features
if categorical_feature is not None: if isinstance(categorical_feature, list):
categorical_indices = set() categorical_indices = set()
feature_dict = {} feature_dict = {}
if feature_name is not None: if isinstance(feature_name, list):
feature_dict = {name: i for i, name in enumerate(feature_name)} feature_dict = {name: i for i, name in enumerate(feature_name)}
for name in categorical_feature: for name in categorical_feature:
if isinstance(name, str) and name in feature_dict: if isinstance(name, str) and name in feature_dict:
......
...@@ -712,8 +712,8 @@ def create_tree_digraph( ...@@ -712,8 +712,8 @@ def create_tree_digraph(
if isinstance(example_case, pd_DataFrame): if isinstance(example_case, pd_DataFrame):
example_case = _data_from_pandas( example_case = _data_from_pandas(
data=example_case, data=example_case,
feature_name=None, feature_name="auto",
categorical_feature=None, categorical_feature="auto",
pandas_categorical=booster.pandas_categorical pandas_categorical=booster.pandas_categorical
)[0] )[0]
example_case = example_case[0] example_case = example_case[0]
......
...@@ -723,7 +723,12 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name): ...@@ -723,7 +723,12 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name):
pd = pytest.importorskip('pandas') pd = pytest.importorskip('pandas')
X = np.random.rand(10, 2).astype(dtype) X = np.random.rand(10, 2).astype(dtype)
df = pd.DataFrame(X) df = pd.DataFrame(X)
built_data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0] built_data = lgb.basic._data_from_pandas(
data=df,
feature_name=feature_name,
categorical_feature="auto",
pandas_categorical=None
)[0]
assert built_data.dtype == dtype assert built_data.dtype == dtype
assert np.shares_memory(X, built_data) assert np.shares_memory(X, built_data)
...@@ -734,7 +739,12 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name): ...@@ -734,7 +739,12 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name):
X = np.random.choice(['a', 'b'], 100).reshape(-1, 1) X = np.random.choice(['a', 'b'], 100).reshape(-1, 1)
column_name = 'a' if feature_name == 'auto' else feature_name[0] column_name = 'a' if feature_name == 'auto' else feature_name[0]
df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category') df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category')
data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0] data = lgb.basic._data_from_pandas(
data=df,
feature_name=feature_name,
categorical_feature="auto",
pandas_categorical=None
)[0]
# check that the original data wasn't modified # check that the original data wasn't modified
np.testing.assert_equal(df[column_name], X[:, 0]) np.testing.assert_equal(df[column_name], X[:, 0])
# check that the built data has the codes # check that the built data has the codes
...@@ -806,3 +816,10 @@ def test_set_leaf_output(): ...@@ -806,3 +816,10 @@ def test_set_leaf_output():
leaf_output = bst.get_leaf_output(tree_id=0, leaf_id=leaf_id) leaf_output = bst.get_leaf_output(tree_id=0, leaf_id=leaf_id)
bst.set_leaf_output(tree_id=0, leaf_id=leaf_id, value=leaf_output + 1) bst.set_leaf_output(tree_id=0, leaf_id=leaf_id, value=leaf_output + 1)
np.testing.assert_allclose(bst.predict(X), y_pred + 1) np.testing.assert_allclose(bst.predict(X), y_pred + 1)
def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Dataset():
    # Without explicit names, constructed Datasets default to "Column_<i>".
    num_features = 3
    ds = lgb.Dataset(data=np.random.randn(100, num_features))
    expected_names = [f"Column_{i}" for i in range(num_features)]
    assert ds.construct().feature_name == expected_names
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment