Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
e7979852
Unverified
Commit
e7979852
authored
Dec 07, 2023
by
José Morales
Committed by
GitHub
Dec 07, 2023
Browse files
[python-package] take shallow copy of dataframe in predict (fixes #6195) (#6218)
parent
4aba4fc1
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
20 additions
and
4 deletions
+20
-4
python-package/lightgbm/basic.py
python-package/lightgbm/basic.py
+4
-1
tests/python_package_test/test_basic.py
tests/python_package_test/test_basic.py
+16
-3
No files found.
python-package/lightgbm/basic.py
View file @
e7979852
...
...
@@ -789,6 +789,10 @@ def _data_from_pandas(
if
len
(
data
.
shape
)
!=
2
or
data
.
shape
[
0
]
<
1
:
raise
ValueError
(
'Input data must be 2 dimensional and non empty.'
)
# take shallow copy in case we modify categorical columns
# whole column modifications don't change the original df
data
=
data
.
copy
(
deep
=
False
)
# determine feature names
if
feature_name
==
'auto'
:
feature_name
=
[
str
(
col
)
for
col
in
data
.
columns
]
...
...
@@ -805,7 +809,6 @@ def _data_from_pandas(
if
list
(
data
[
col
].
cat
.
categories
)
!=
list
(
category
):
data
[
col
]
=
data
[
col
].
cat
.
set_categories
(
category
)
if
len
(
cat_cols
):
# cat_cols is list
data
=
data
.
copy
(
deep
=
False
)
# not alter origin DataFrame
data
[
cat_cols
]
=
data
[
cat_cols
].
apply
(
lambda
x
:
x
.
cat
.
codes
).
replace
({
-
1
:
np
.
nan
})
if
categorical_feature
==
'auto'
:
# use cat cols from DataFrame
categorical_feature
=
cat_cols_not_ordered
...
...
tests/python_package_test/test_basic.py
View file @
e7979852
...
...
@@ -822,21 +822,34 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name):
@
pytest
.
mark
.
parametrize
(
'feature_name'
,
[[
'x1'
],
[
42
],
'auto'
])
def
test_categorical_code_conversion_doesnt_modify_original_data
(
feature_name
):
@
pytest
.
mark
.
parametrize
(
'categories'
,
[
'seen'
,
'unseen'
])
def
test_categorical_code_conversion_doesnt_modify_original_data
(
feature_name
,
categories
):
pd
=
pytest
.
importorskip
(
'pandas'
)
X
=
np
.
random
.
choice
([
'a'
,
'b'
],
100
).
reshape
(
-
1
,
1
)
column_name
=
'a'
if
feature_name
==
'auto'
else
feature_name
[
0
]
df
=
pd
.
DataFrame
(
X
.
copy
(),
columns
=
[
column_name
],
dtype
=
'category'
)
if
categories
==
'seen'
:
pandas_categorical
=
[[
'a'
,
'b'
]]
else
:
pandas_categorical
=
[[
'a'
]]
data
=
lgb
.
basic
.
_data_from_pandas
(
data
=
df
,
feature_name
=
feature_name
,
categorical_feature
=
"auto"
,
pandas_categorical
=
None
pandas_categorical
=
pandas_categorical
,
)[
0
]
# check that the original data wasn't modified
np
.
testing
.
assert_equal
(
df
[
column_name
],
X
[:,
0
])
# check that the built data has the codes
np
.
testing
.
assert_equal
(
df
[
column_name
].
cat
.
codes
,
data
[:,
0
])
if
categories
==
'seen'
:
# if all categories were seen during training we just take the codes
codes
=
df
[
column_name
].
cat
.
codes
else
:
# if we only saw 'a' during training we just replace its code
# and leave the rest as nan
a_code
=
df
[
column_name
].
cat
.
categories
.
get_loc
(
'a'
)
codes
=
np
.
where
(
df
[
column_name
]
==
'a'
,
a_code
,
np
.
nan
)
np
.
testing
.
assert_equal
(
codes
,
data
[:,
0
])
@
pytest
.
mark
.
parametrize
(
'min_data_in_bin'
,
[
2
,
10
])
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment