Unverified Commit e7979852 authored by José Morales, committed by GitHub
Browse files

[python-package] take shallow copy of dataframe in predict (fixes #6195) (#6218)

parent 4aba4fc1
...@@ -789,6 +789,10 @@ def _data_from_pandas( ...@@ -789,6 +789,10 @@ def _data_from_pandas(
if len(data.shape) != 2 or data.shape[0] < 1: if len(data.shape) != 2 or data.shape[0] < 1:
raise ValueError('Input data must be 2 dimensional and non empty.') raise ValueError('Input data must be 2 dimensional and non empty.')
# take shallow copy in case we modify categorical columns
# whole column modifications don't change the original df
data = data.copy(deep=False)
# determine feature names # determine feature names
if feature_name == 'auto': if feature_name == 'auto':
feature_name = [str(col) for col in data.columns] feature_name = [str(col) for col in data.columns]
...@@ -805,7 +809,6 @@ def _data_from_pandas( ...@@ -805,7 +809,6 @@ def _data_from_pandas(
if list(data[col].cat.categories) != list(category): if list(data[col].cat.categories) != list(category):
data[col] = data[col].cat.set_categories(category) data[col] = data[col].cat.set_categories(category)
if len(cat_cols): # cat_cols is list if len(cat_cols): # cat_cols is list
data = data.copy(deep=False) # not alter origin DataFrame
data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan}) data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan})
if categorical_feature == 'auto': # use cat cols from DataFrame if categorical_feature == 'auto': # use cat cols from DataFrame
categorical_feature = cat_cols_not_ordered categorical_feature = cat_cols_not_ordered
......
...@@ -822,21 +822,34 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name): ...@@ -822,21 +822,34 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name):
@pytest.mark.parametrize('feature_name', [['x1'], [42], 'auto']) @pytest.mark.parametrize('feature_name', [['x1'], [42], 'auto'])
def test_categorical_code_conversion_doesnt_modify_original_data(feature_name): @pytest.mark.parametrize('categories', ['seen', 'unseen'])
def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories):
pd = pytest.importorskip('pandas') pd = pytest.importorskip('pandas')
X = np.random.choice(['a', 'b'], 100).reshape(-1, 1) X = np.random.choice(['a', 'b'], 100).reshape(-1, 1)
column_name = 'a' if feature_name == 'auto' else feature_name[0] column_name = 'a' if feature_name == 'auto' else feature_name[0]
df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category') df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category')
if categories == 'seen':
pandas_categorical = [['a', 'b']]
else:
pandas_categorical = [['a']]
data = lgb.basic._data_from_pandas( data = lgb.basic._data_from_pandas(
data=df, data=df,
feature_name=feature_name, feature_name=feature_name,
categorical_feature="auto", categorical_feature="auto",
pandas_categorical=None pandas_categorical=pandas_categorical,
)[0] )[0]
# check that the original data wasn't modified # check that the original data wasn't modified
np.testing.assert_equal(df[column_name], X[:, 0]) np.testing.assert_equal(df[column_name], X[:, 0])
# check that the built data has the codes # check that the built data has the codes
np.testing.assert_equal(df[column_name].cat.codes, data[:, 0]) if categories == 'seen':
# if all categories were seen during training we just take the codes
codes = df[column_name].cat.codes
else:
# if we only saw 'a' during training we just replace its code
# and leave the rest as nan
a_code = df[column_name].cat.categories.get_loc('a')
codes = np.where(df[column_name] == 'a', a_code, np.nan)
np.testing.assert_equal(codes, data[:, 0])
@pytest.mark.parametrize('min_data_in_bin', [2, 10]) @pytest.mark.parametrize('min_data_in_bin', [2, 10])
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment