Unverified Commit 65b3db1c authored by José Morales's avatar José Morales Committed by GitHub
Browse files

[python-package] make a shallow copy on dataframe rename (fixes #4596) (#5254)

* don't copy dataframe on rename

* test with feature_name and 'auto'
parent f7156457
...@@ -537,7 +537,7 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica ...@@ -537,7 +537,7 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
if len(data.shape) != 2 or data.shape[0] < 1: if len(data.shape) != 2 or data.shape[0] < 1:
raise ValueError('Input data must be 2 dimensional and non empty.') raise ValueError('Input data must be 2 dimensional and non empty.')
if feature_name == 'auto' or feature_name is None: if feature_name == 'auto' or feature_name is None:
data = data.rename(columns=str) data = data.rename(columns=str, copy=False)
cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pd_CategoricalDtype)] cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pd_CategoricalDtype)]
cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered] cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered]
if pandas_categorical is None: # train dataset if pandas_categorical is None: # train dataset
......
...@@ -644,22 +644,22 @@ def test_custom_objective_safety(): ...@@ -644,22 +644,22 @@ def test_custom_objective_safety():
@pytest.mark.parametrize('dtype', [np.float32, np.float64]) @pytest.mark.parametrize('dtype', [np.float32, np.float64])
def test_no_copy_when_single_float_dtype_dataframe(dtype): @pytest.mark.parametrize('feature_name', [['x1', 'x2'], 'auto'])
def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name):
pd = pytest.importorskip('pandas') pd = pytest.importorskip('pandas')
X = np.random.rand(10, 2).astype(dtype) X = np.random.rand(10, 2).astype(dtype)
df = pd.DataFrame(X) df = pd.DataFrame(X)
# feature names are required to not make a copy (rename makes a copy)
feature_name = ['x1', 'x2']
built_data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0] built_data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0]
assert built_data.dtype == dtype assert built_data.dtype == dtype
assert np.shares_memory(X, built_data) assert np.shares_memory(X, built_data)
def test_categorical_code_conversion_doesnt_modify_original_data(): @pytest.mark.parametrize('feature_name', [['x1'], 'auto'])
def test_categorical_code_conversion_doesnt_modify_original_data(feature_name):
pd = pytest.importorskip('pandas') pd = pytest.importorskip('pandas')
X = np.random.choice(['a', 'b'], 100).reshape(-1, 1) X = np.random.choice(['a', 'b'], 100).reshape(-1, 1)
df = pd.DataFrame(X.copy(), columns=['x1'], dtype='category') df = pd.DataFrame(X.copy(), columns=['x1'], dtype='category')
data = lgb.basic._data_from_pandas(df, ['x1'], None, None)[0] data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0]
# check that the original data wasn't modified # check that the original data wasn't modified
np.testing.assert_equal(df['x1'], X[:, 0]) np.testing.assert_equal(df['x1'], X[:, 0])
# check that the built data has the codes # check that the built data has the codes
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment