Unverified Commit f1856956 authored by José Morales, committed by GitHub

[python-package] add support for pandas nullable types (fixes #4173) (#4927)



* map nullable dtypes to regular float dtypes

* cast x3 to float after introducing missing values

* add test for regular dtypes

* use .astype and then values. update nullable_dtypes test and include test for regular numpy dtypes

* more specific allowed dtypes. test no copy when single float dtype df

* use np.find_common_type. set np.float128 to None when it isn't supported

* set default as type(None)

* move tests that use lgb.train to test_engine

* include np.float32 when finding common dtype

* Apply suggestions from code review
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* add linebreak
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
parent 97c8d945
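
For context, here is a minimal usage sketch of what this change enables. It is not part of the diff; the column names and parameters are illustrative only, and it assumes a pandas version that provides the nullable extension dtypes (Int32, Float64, boolean):

import numpy as np
import pandas as pd
import lightgbm as lgb

# DataFrames with pandas nullable dtypes can now be passed to lgb.Dataset
# directly; missing values become np.nan after the cast to a common float dtype.
rng = np.random.RandomState(0)
df = pd.DataFrame({
    'a': pd.array(rng.randint(0, 10, 100), dtype='Int32'),  # nullable integer
    'b': pd.array(rng.rand(100), dtype='Float64'),          # nullable float
    'c': pd.array(rng.rand(100) < 0.5, dtype='boolean'),    # nullable boolean
})
df.loc[0, 'a'] = pd.NA  # a missing value, converted to np.nan internally
y = rng.rand(100)

ds = lgb.Dataset(df, y)
bst = lgb.train({'objective': 'l2', 'verbose': -1}, ds, num_boost_round=5)
preds = bst.predict(df)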
@@ -17,8 +17,7 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
 import numpy as np
 import scipy.sparse
 
-from .compat import (PANDAS_INSTALLED, concat, dt_DataTable, is_dtype_sparse, pd_CategoricalDtype, pd_DataFrame,
-                     pd_Series)
+from .compat import PANDAS_INSTALLED, concat, dt_DataTable, pd_CategoricalDtype, pd_DataFrame, pd_Series
 from .libpath import find_lib_path
 
 ZERO_THRESHOLD = 1e-35
@@ -502,14 +501,15 @@ def c_int_array(data):
 
 
 def _get_bad_pandas_dtypes(dtypes):
-    pandas_dtype_mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int',
-                           'int64': 'int', 'uint8': 'int', 'uint16': 'int',
-                           'uint32': 'int', 'uint64': 'int', 'bool': 'int',
-                           'float16': 'float', 'float32': 'float', 'float64': 'float'}
-    bad_indices = [i for i, dtype in enumerate(dtypes) if (dtype.name not in pandas_dtype_mapper
-                                                           and (not is_dtype_sparse(dtype)
-                                                                or dtype.subtype.name not in pandas_dtype_mapper))]
-    return bad_indices
+    float128 = getattr(np, 'float128', type(None))
+
+    def is_allowed_numpy_dtype(dtype):
+        return (
+            issubclass(dtype, (np.integer, np.floating, np.bool_))
+            and not issubclass(dtype, (np.timedelta64, float128))
+        )
+
+    return [i for i, dtype in enumerate(dtypes) if not is_allowed_numpy_dtype(dtype.type)]
 
 
 def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical):
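
Why this admits pandas nullable dtypes: the .type attribute of a nullable extension dtype points at the underlying NumPy scalar type, so the issubclass checks above accept it while object columns are still rejected. An illustrative sketch, not part of the diff:

import numpy as np
import pandas as pd

# nullable extension dtypes expose their NumPy scalar type via .type
assert pd.Int32Dtype().type is np.int32
assert pd.Float64Dtype().type is np.float64
assert pd.BooleanDtype().type is np.bool_

# object columns remain disallowed: np.object_ is not an integer, float or bool
assert not issubclass(np.dtype('O').type, (np.integer, np.floating, np.bool_))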
@@ -546,9 +546,10 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical):
             raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n"
                              "Did not expect the data types in the following fields: "
                              f"{bad_index_cols_str}")
-        data = data.values
-        if data.dtype != np.float32 and data.dtype != np.float64:
-            data = data.astype(np.float32)
+        df_dtypes = [dtype.type for dtype in data.dtypes]
+        df_dtypes.append(np.float32)  # so that the target dtype considers floats
+        target_dtype = np.find_common_type(df_dtypes, [])
+        data = data.astype(target_dtype, copy=False).values
     else:
         if feature_name == 'auto':
             feature_name = None
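
How the target dtype is chosen: np.find_common_type (available at the time of this change, deprecated in later NumPy releases) promotes the per-column dtypes to a single dtype, and appending np.float32 guarantees a floating-point result so missing values can be represented as np.nan. An illustrative sketch, not part of the diff:

import numpy as np

# small integer columns promote with float32 to float32
assert np.find_common_type([np.int8, np.int16, np.float32], []) == np.float32
# int64 cannot be represented exactly in float32, so promotion yields float64
assert np.find_common_type([np.int64, np.float32], []) == np.float64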
@@ -6,7 +6,6 @@ try:
     from pandas import DataFrame as pd_DataFrame
     from pandas import Series as pd_Series
     from pandas import concat
-    from pandas.api.types import is_sparse as is_dtype_sparse
     try:
         from pandas import CategoricalDtype as pd_CategoricalDtype
     except ImportError:
@@ -34,7 +33,6 @@ except ImportError:
         pass
 
     concat = None
-    is_dtype_sparse = None
 
 """matplotlib"""
 try:
@@ -8,7 +8,7 @@ import numpy as np
 import pytest
 from scipy import sparse
 from sklearn.datasets import dump_svmlight_file, load_svmlight_file, make_blobs
-from sklearn.metrics import log_loss
+from sklearn.metrics import log_loss, mean_squared_error
 from sklearn.model_selection import train_test_split
 
 import lightgbm as lgb
@@ -658,3 +658,15 @@ def test_multiclass_custom_eval():
         _, metric, value, _ = bst.eval(ds, key, feval=custom_eval)[1]  # first element is multi_logloss
         assert metric == 'custom_logloss'
         np.testing.assert_allclose(value, eval_result[key][metric][-1])
+
+
+@pytest.mark.parametrize('dtype', [np.float32, np.float64])
+def test_no_copy_when_single_float_dtype_dataframe(dtype):
+    pd = pytest.importorskip('pandas')
+    X = np.random.rand(10, 2).astype(dtype)
+    df = pd.DataFrame(X)
+    # feature names are required to not make a copy (rename makes a copy)
+    feature_name = ['x1', 'x2']
+    built_data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0]
+    assert built_data.dtype == dtype
+    assert np.shares_memory(X, built_data)
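
What the no-copy test relies on, as a standalone sketch (not part of the diff): when every column already has the single target float dtype, astype(..., copy=False) is a no-op and .values can return a view of the original buffer. This assumes a pandas version where constructing a DataFrame from a homogeneous ndarray does not copy:

import numpy as np
import pandas as pd

X = np.random.rand(10, 2).astype(np.float32)
df = pd.DataFrame(X)  # single float32 block backed by X
arr = df.astype(np.float32, copy=False).values  # astype is a no-op here
assert np.shares_memory(X, arr)  # same underlying buffer, no copy made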
@@ -3291,3 +3291,87 @@ def test_record_evaluation_with_cv(train_metric):
         np.testing.assert_allclose(
             cv_hist[key], eval_result[dataset][f'{metric}-{agg}']
         )
+
+
+def test_pandas_with_numpy_regular_dtypes():
+    pd = pytest.importorskip('pandas')
+    uints = ['uint8', 'uint16', 'uint32', 'uint64']
+    ints = ['int8', 'int16', 'int32', 'int64']
+    bool_and_floats = ['bool', 'float16', 'float32', 'float64']
+    rng = np.random.RandomState(42)
+    n_samples = 100
+
+    # data as float64
+    df = pd.DataFrame({
+        'x1': rng.randint(0, 2, n_samples),
+        'x2': rng.randint(1, 3, n_samples),
+        'x3': 10 * rng.randint(1, 3, n_samples),
+        'x4': 100 * rng.randint(1, 3, n_samples),
+    })
+    df = df.astype(np.float64)
+    y = df['x1'] * (df['x2'] + df['x3'] + df['x4'])
+    ds = lgb.Dataset(df, y)
+    params = {'objective': 'l2', 'num_leaves': 31, 'min_child_samples': 1}
+    bst = lgb.train(params, ds, num_boost_round=5)
+    preds = bst.predict(df)
+
+    # test all features were used
+    assert bst.trees_to_dataframe()['split_feature'].nunique() == df.shape[1]
+    # test the score is better than predicting the mean
+    baseline = np.full_like(y, y.mean())
+    assert mean_squared_error(y, preds) < mean_squared_error(y, baseline)
+
+    # test all predictions are equal using different input dtypes
+    for target_dtypes in [uints, ints, bool_and_floats]:
+        df2 = df.astype({f'x{i}': dtype for i, dtype in enumerate(target_dtypes, start=1)})
+        assert df2.dtypes.tolist() == target_dtypes
+        ds2 = lgb.Dataset(df2, y)
+        bst2 = lgb.train(params, ds2, num_boost_round=5)
+        preds2 = bst2.predict(df2)
+        np.testing.assert_allclose(preds, preds2)
+
+
+def test_pandas_nullable_dtypes():
+    pd = pytest.importorskip('pandas')
+    rng = np.random.RandomState(0)
+    df = pd.DataFrame({
+        'x1': rng.randint(1, 3, size=100),
+        'x2': np.linspace(-1, 1, 100),
+        'x3': pd.arrays.SparseArray(rng.randint(0, 11, size=100)),
+        'x4': rng.rand(100) < 0.5,
+    })
+
+    # introduce some missing values
+    df.loc[1, 'x1'] = np.nan
+    df.loc[2, 'x2'] = np.nan
+    df.loc[3, 'x4'] = np.nan
+    # the previous line turns x4 into object dtype in recent versions of pandas
+    df['x4'] = df['x4'].astype(np.float64)
+
+    y = df['x1'] * df['x2'] + df['x3'] * (1 + df['x4'])
+    y = y.fillna(0)
+
+    # train with regular dtypes
+    params = {'objective': 'l2', 'num_leaves': 31, 'min_child_samples': 1}
+    ds = lgb.Dataset(df, y)
+    bst = lgb.train(params, ds, num_boost_round=5)
+    preds = bst.predict(df)
+
+    # convert to nullable dtypes
+    df2 = df.copy()
+    df2['x1'] = df2['x1'].astype('Int32')
+    df2['x2'] = df2['x2'].astype('Float64')
+    df2['x4'] = df2['x4'].astype('boolean')
+
+    # test training succeeds
+    ds_nullable_dtypes = lgb.Dataset(df2, y)
+    bst_nullable_dtypes = lgb.train(params, ds_nullable_dtypes, num_boost_round=5)
+    preds_nullable_dtypes = bst_nullable_dtypes.predict(df2)
+
+    trees_df = bst_nullable_dtypes.trees_to_dataframe()
+    # test all features were used
+    assert trees_df['split_feature'].nunique() == df.shape[1]
+    # test the score is better than predicting the mean
+    baseline = np.full_like(y, y.mean())
+    assert mean_squared_error(y, preds) < mean_squared_error(y, baseline)
+    # test equal predictions
+    np.testing.assert_allclose(preds, preds_nullable_dtypes)
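
The key conversion exercised by the nullable-dtypes test, as an illustrative sketch (not part of the diff): casting a nullable column to a NumPy float dtype turns pd.NA into np.nan, which LightGBM already treats as a missing value:

import numpy as np
import pandas as pd

s = pd.Series([1, None, 3], dtype='Int32')  # nullable integer with a missing value
arr = s.astype(np.float64).values           # pd.NA becomes np.nan
assert arr[0] == 1.0 and np.isnan(arr[1])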