Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
18dbd65e
Unverified
Commit
18dbd65e
authored
Nov 15, 2023
by
James Lamb
Committed by
GitHub
Nov 15, 2023
Browse files
[python-package] consolidate pandas-to-numpy conversion code (#6156)
parent
e63e54ac
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
27 additions
and
26 deletions
+27
-26
python-package/lightgbm/basic.py
python-package/lightgbm/basic.py
+27
-26
No files found.
python-package/lightgbm/basic.py
View file @
18dbd65e
...
@@ -758,6 +758,23 @@ def _check_for_bad_pandas_dtypes(pandas_dtypes_series: pd_Series) -> None:
...
@@ -758,6 +758,23 @@ def _check_for_bad_pandas_dtypes(pandas_dtypes_series: pd_Series) -> None:
f
'Fields with bad pandas dtypes:
{
", "
.
join
(
bad_pandas_dtypes
)
}
'
)
f
'Fields with bad pandas dtypes:
{
", "
.
join
(
bad_pandas_dtypes
)
}
'
)
def _pandas_to_numpy(
    data: pd_DataFrame,
    target_dtype: "np.typing.DTypeLike"
) -> np.ndarray:
    """Convert a pandas DataFrame into a numpy array of ``target_dtype``.

    The column dtypes are validated with ``_check_for_bad_pandas_dtypes``
    first. The conversion is then attempted with ``DataFrame.to_numpy``
    without copying, falling back to alternative conversions when that call
    raises.

    Parameters
    ----------
    data : pandas DataFrame
        Frame to convert.
    target_dtype : numpy dtype-like
        dtype requested for the resulting array.

    Returns
    -------
    converted : numpy.ndarray
        Array produced by whichever conversion path succeeded.
    """
    _check_for_bad_pandas_dtypes(data.dtypes)
    try:
        # fast path, hit most often: no nullable dtypes are present
        converted = data.to_numpy(dtype=target_dtype, copy=False)
    except ValueError:
        # nullable dtypes are present; passing na_value lets the conversion
        # succeed, at the price of a copy being made
        converted = data.to_numpy(dtype=target_dtype, na_value=np.nan)
    except TypeError:
        # rarest path: 1.0 <= pandas < 1.1 together with nullable dtypes.
        # There the array is cast to type(pd.NA) and to_numpy() has no
        # na_value argument, so go through astype() instead.
        converted = data.astype(target_dtype, copy=False).values
    return converted
def
_data_from_pandas
(
def
_data_from_pandas
(
data
:
pd_DataFrame
,
data
:
pd_DataFrame
,
feature_name
:
_LGBM_FeatureNameConfiguration
,
feature_name
:
_LGBM_FeatureNameConfiguration
,
...
@@ -790,22 +807,17 @@ def _data_from_pandas(
...
@@ -790,22 +807,17 @@ def _data_from_pandas(
else
:
# use cat cols specified by user
else
:
# use cat cols specified by user
categorical_feature
=
list
(
categorical_feature
)
# type: ignore[assignment]
categorical_feature
=
list
(
categorical_feature
)
# type: ignore[assignment]
# get numpy representation of the data
_check_for_bad_pandas_dtypes
(
data
.
dtypes
)
df_dtypes
=
[
dtype
.
type
for
dtype
in
data
.
dtypes
]
df_dtypes
=
[
dtype
.
type
for
dtype
in
data
.
dtypes
]
df_dtypes
.
append
(
np
.
float32
)
# so that the target dtype considers floats
# so that the target dtype considers floats
df_dtypes
.
append
(
np
.
float32
)
target_dtype
=
np
.
result_type
(
*
df_dtypes
)
target_dtype
=
np
.
result_type
(
*
df_dtypes
)
try
:
# most common case (no nullable dtypes)
return
(
data
=
data
.
to_numpy
(
dtype
=
target_dtype
,
copy
=
False
)
_pandas_to_numpy
(
data
,
target_dtype
=
target_dtype
),
except
TypeError
:
feature_name
,
# 1.0 <= pd version < 1.1 and nullable dtypes, least common case
categorical_feature
,
# raises error because array is casted to type(pd.NA) and there's no na_value argument
pandas_categorical
data
=
data
.
astype
(
target_dtype
,
copy
=
False
).
values
)
except
ValueError
:
# data has nullable dtypes, but we can specify na_value argument and copy will be made
data
=
data
.
to_numpy
(
dtype
=
target_dtype
,
na_value
=
np
.
nan
)
return
data
,
feature_name
,
categorical_feature
,
pandas_categorical
def
_dump_pandas_categorical
(
def
_dump_pandas_categorical
(
...
@@ -2805,18 +2817,7 @@ class Dataset:
...
@@ -2805,18 +2817,7 @@ class Dataset:
if
isinstance
(
label
,
pd_DataFrame
):
if
isinstance
(
label
,
pd_DataFrame
):
if
len
(
label
.
columns
)
>
1
:
if
len
(
label
.
columns
)
>
1
:
raise
ValueError
(
'DataFrame for label cannot have multiple columns'
)
raise
ValueError
(
'DataFrame for label cannot have multiple columns'
)
_check_for_bad_pandas_dtypes
(
label
.
dtypes
)
label_array
=
np
.
ravel
(
_pandas_to_numpy
(
label
,
target_dtype
=
np
.
float32
))
try
:
# most common case (no nullable dtypes)
label
=
label
.
to_numpy
(
dtype
=
np
.
float32
,
copy
=
False
)
except
TypeError
:
# 1.0 <= pd version < 1.1 and nullable dtypes, least common case
# raises error because array is casted to type(pd.NA) and there's no na_value argument
label
=
label
.
astype
(
np
.
float32
,
copy
=
False
).
values
except
ValueError
:
# data has nullable dtypes, but we can specify na_value argument and copy will be made
label
=
label
.
to_numpy
(
dtype
=
np
.
float32
,
na_value
=
np
.
nan
)
label_array
=
np
.
ravel
(
label
)
elif
_is_pyarrow_array
(
label
):
elif
_is_pyarrow_array
(
label
):
label_array
=
label
label_array
=
label
else
:
else
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment