Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
ee511201
Unverified
Commit
ee511201
authored
Sep 06, 2023
by
James Lamb
Committed by
GitHub
Sep 06, 2023
Browse files
[python-package] simplify processing of pandas data (#6066)
parent
82033064
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
83 additions
and
60 deletions
+83
-60
python-package/lightgbm/basic.py
python-package/lightgbm/basic.py
+62
-56
python-package/lightgbm/plotting.py
python-package/lightgbm/plotting.py
+2
-2
tests/python_package_test/test_basic.py
tests/python_package_test/test_basic.py
+19
-2
No files found.
python-package/lightgbm/basic.py
View file @
ee511201
...
def _data_from_pandas(
    data: pd_DataFrame,
    feature_name: _LGBM_FeatureNameConfiguration,
    categorical_feature: _LGBM_CategoricalFeatureConfiguration,
    pandas_categorical: Optional[List[List]]
) -> Tuple[np.ndarray, List[str], List[str], List[List]]:
    """Extract a numpy array and feature metadata from a pandas DataFrame.

    Parameters
    ----------
    data : pd_DataFrame
        Raw data; must be 2-dimensional and non-empty.
    feature_name : list of str, or 'auto'
        Feature names. If 'auto', names are taken from ``data.columns``
        (converted to ``str``).
    categorical_feature : list of str or int, or 'auto'
        Categorical features. If 'auto', the DataFrame's unordered
        categorical columns are used.
    pandas_categorical : list of list, or None
        Category levels recorded at training time. ``None`` means this is
        the training dataset and levels are collected here; otherwise the
        DataFrame's categoricals are re-aligned to these levels.

    Returns
    -------
    result : tuple
        ``(data, feature_name, categorical_feature, pandas_categorical)``
        where ``data`` is a 2-D numpy array of a dtype wide enough to hold
        every column (at least float32).

    Raises
    ------
    ValueError
        If ``data`` is not 2-dimensional/non-empty, or if the categorical
        columns do not match ``pandas_categorical`` from the train dataset.
    """
    if len(data.shape) != 2 or data.shape[0] < 1:
        raise ValueError('Input data must be 2 dimensional and non empty.')

    # determine feature names
    if feature_name == 'auto':
        feature_name = [str(col) for col in data.columns]

    # determine categorical features
    cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pd_CategoricalDtype)]
    cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered]
    if pandas_categorical is None:  # train dataset
        pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
    else:
        # valid/test dataset: category levels must line up with the train dataset
        if len(cat_cols) != len(pandas_categorical):
            raise ValueError('train and valid dataset categorical_feature do not match.')
        for col, category in zip(cat_cols, pandas_categorical):
            if list(data[col].cat.categories) != list(category):
                data[col] = data[col].cat.set_categories(category)
    if len(cat_cols):  # cat_cols is list
        data = data.copy(deep=False)  # not alter origin DataFrame
        # pandas uses code -1 for values not among the categories; map it to NaN
        data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan})

    if categorical_feature == 'auto':  # use cat cols from DataFrame
        categorical_feature = cat_cols_not_ordered
    else:  # use cat cols specified by user
        categorical_feature = list(categorical_feature)  # type: ignore[assignment]

    # get numpy representation of the data
    _check_for_bad_pandas_dtypes(data.dtypes)
    df_dtypes = [dtype.type for dtype in data.dtypes]
    df_dtypes.append(np.float32)  # so that the target dtype considers floats
    target_dtype = np.result_type(*df_dtypes)
    try:
        # most common case (no nullable dtypes)
        data = data.to_numpy(dtype=target_dtype, copy=False)
    except TypeError:
        # 1.0 <= pd version < 1.1 and nullable dtypes, least common case
        # raises error because array is casted to type(pd.NA) and there's no na_value argument
        data = data.astype(target_dtype, copy=False).values
    except ValueError:
        # data has nullable dtypes, but we can specify na_value argument and copy will be made
        data = data.to_numpy(dtype=target_dtype, na_value=np.nan)
    return data, feature_name, categorical_feature, pandas_categorical
...
@@ -1004,7 +999,15 @@ class _InnerPredictor:
...
@@ -1004,7 +999,15 @@ class _InnerPredictor:
ctypes
.
c_int
(
len
(
data_names
)),
ctypes
.
c_int
(
len
(
data_names
)),
)
)
)
)
data
=
_data_from_pandas
(
data
,
None
,
None
,
self
.
pandas_categorical
)[
0
]
if
isinstance
(
data
,
pd_DataFrame
):
data
=
_data_from_pandas
(
data
=
data
,
feature_name
=
"auto"
,
categorical_feature
=
"auto"
,
pandas_categorical
=
self
.
pandas_categorical
)[
0
]
predict_type
=
_C_API_PREDICT_NORMAL
predict_type
=
_C_API_PREDICT_NORMAL
if
raw_score
:
if
raw_score
:
predict_type
=
_C_API_PREDICT_RAW_SCORE
predict_type
=
_C_API_PREDICT_RAW_SCORE
...
@@ -1854,10 +1857,13 @@ class Dataset:
...
@@ -1854,10 +1857,13 @@ class Dataset:
if
reference
is
not
None
:
if
reference
is
not
None
:
self
.
pandas_categorical
=
reference
.
pandas_categorical
self
.
pandas_categorical
=
reference
.
pandas_categorical
categorical_feature
=
reference
.
categorical_feature
categorical_feature
=
reference
.
categorical_feature
data
,
feature_name
,
categorical_feature
,
self
.
pandas_categorical
=
_data_from_pandas
(
data
=
data
,
if
isinstance
(
data
,
pd_DataFrame
):
feature_name
=
feature_name
,
data
,
feature_name
,
categorical_feature
,
self
.
pandas_categorical
=
_data_from_pandas
(
categorical_feature
=
categorical_feature
,
data
=
data
,
pandas_categorical
=
self
.
pandas_categorical
)
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
pandas_categorical
=
self
.
pandas_categorical
)
# process for args
# process for args
params
=
{}
if
params
is
None
else
params
params
=
{}
if
params
is
None
else
params
...
@@ -1867,10 +1873,10 @@ class Dataset:
...
@@ -1867,10 +1873,10 @@ class Dataset:
_log_warning
(
f
'
{
key
}
keyword has been found in `params` and will be ignored.
\n
'
_log_warning
(
f
'
{
key
}
keyword has been found in `params` and will be ignored.
\n
'
f
'Please use
{
key
}
argument of the Dataset constructor to pass this parameter.'
)
f
'Please use
{
key
}
argument of the Dataset constructor to pass this parameter.'
)
# get categorical features
# get categorical features
if
categorical_feature
is
not
None
:
if
isinstance
(
categorical_feature
,
list
)
:
categorical_indices
=
set
()
categorical_indices
=
set
()
feature_dict
=
{}
feature_dict
=
{}
if
feature_name
is
not
None
:
if
isinstance
(
feature_name
,
list
)
:
feature_dict
=
{
name
:
i
for
i
,
name
in
enumerate
(
feature_name
)}
feature_dict
=
{
name
:
i
for
i
,
name
in
enumerate
(
feature_name
)}
for
name
in
categorical_feature
:
for
name
in
categorical_feature
:
if
isinstance
(
name
,
str
)
and
name
in
feature_dict
:
if
isinstance
(
name
,
str
)
and
name
in
feature_dict
:
...
...
python-package/lightgbm/plotting.py
View file @
ee511201
...
@@ -712,8 +712,8 @@ def create_tree_digraph(
...
@@ -712,8 +712,8 @@ def create_tree_digraph(
if
isinstance
(
example_case
,
pd_DataFrame
):
if
isinstance
(
example_case
,
pd_DataFrame
):
example_case
=
_data_from_pandas
(
example_case
=
_data_from_pandas
(
data
=
example_case
,
data
=
example_case
,
feature_name
=
None
,
feature_name
=
"auto"
,
categorical_feature
=
None
,
categorical_feature
=
"auto"
,
pandas_categorical
=
booster
.
pandas_categorical
pandas_categorical
=
booster
.
pandas_categorical
)[
0
]
)[
0
]
example_case
=
example_case
[
0
]
example_case
=
example_case
[
0
]
...
...
tests/python_package_test/test_basic.py
View file @
ee511201
...
@@ -723,7 +723,12 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name):
...
@@ -723,7 +723,12 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name):
pd
=
pytest
.
importorskip
(
'pandas'
)
pd
=
pytest
.
importorskip
(
'pandas'
)
X
=
np
.
random
.
rand
(
10
,
2
).
astype
(
dtype
)
X
=
np
.
random
.
rand
(
10
,
2
).
astype
(
dtype
)
df
=
pd
.
DataFrame
(
X
)
df
=
pd
.
DataFrame
(
X
)
built_data
=
lgb
.
basic
.
_data_from_pandas
(
df
,
feature_name
,
None
,
None
)[
0
]
built_data
=
lgb
.
basic
.
_data_from_pandas
(
data
=
df
,
feature_name
=
feature_name
,
categorical_feature
=
"auto"
,
pandas_categorical
=
None
)[
0
]
assert
built_data
.
dtype
==
dtype
assert
built_data
.
dtype
==
dtype
assert
np
.
shares_memory
(
X
,
built_data
)
assert
np
.
shares_memory
(
X
,
built_data
)
...
@@ -734,7 +739,12 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name):
...
@@ -734,7 +739,12 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name):
X
=
np
.
random
.
choice
([
'a'
,
'b'
],
100
).
reshape
(
-
1
,
1
)
X
=
np
.
random
.
choice
([
'a'
,
'b'
],
100
).
reshape
(
-
1
,
1
)
column_name
=
'a'
if
feature_name
==
'auto'
else
feature_name
[
0
]
column_name
=
'a'
if
feature_name
==
'auto'
else
feature_name
[
0
]
df
=
pd
.
DataFrame
(
X
.
copy
(),
columns
=
[
column_name
],
dtype
=
'category'
)
df
=
pd
.
DataFrame
(
X
.
copy
(),
columns
=
[
column_name
],
dtype
=
'category'
)
data
=
lgb
.
basic
.
_data_from_pandas
(
df
,
feature_name
,
None
,
None
)[
0
]
data
=
lgb
.
basic
.
_data_from_pandas
(
data
=
df
,
feature_name
=
feature_name
,
categorical_feature
=
"auto"
,
pandas_categorical
=
None
)[
0
]
# check that the original data wasn't modified
# check that the original data wasn't modified
np
.
testing
.
assert_equal
(
df
[
column_name
],
X
[:,
0
])
np
.
testing
.
assert_equal
(
df
[
column_name
],
X
[:,
0
])
# check that the built data has the codes
# check that the built data has the codes
...
@@ -806,3 +816,10 @@ def test_set_leaf_output():
...
@@ -806,3 +816,10 @@ def test_set_leaf_output():
leaf_output
=
bst
.
get_leaf_output
(
tree_id
=
0
,
leaf_id
=
leaf_id
)
leaf_output
=
bst
.
get_leaf_output
(
tree_id
=
0
,
leaf_id
=
leaf_id
)
bst
.
set_leaf_output
(
tree_id
=
0
,
leaf_id
=
leaf_id
,
value
=
leaf_output
+
1
)
bst
.
set_leaf_output
(
tree_id
=
0
,
leaf_id
=
leaf_id
,
value
=
leaf_output
+
1
)
np
.
testing
.
assert_allclose
(
bst
.
predict
(
X
),
y_pred
+
1
)
np
.
testing
.
assert_allclose
(
bst
.
predict
(
X
),
y_pred
+
1
)
def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Dataset():
    """When no feature names are given, constructed Datasets auto-name columns 'Column_<i>'."""
    ds = lgb.Dataset(
        data=np.random.randn(100, 3),
    )
    assert ds.construct().feature_name == ["Column_0", "Column_1", "Column_2"]
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment