Unverified Commit 8f446be7 authored by Nikita Titov's avatar Nikita Titov Committed by GitHub
Browse files

[python] add sparsity support for new version of pandas and check Series for bad dtypes (#2318)

* reworked pandas dtypes mapper

* added tests

* added sparsity support for new version of pandas

* fixed tests for old pandas

* check pd.Series for bad dtypes as well

* enhanced tests

* fixed pylint
parent 9cf6b828
......@@ -13,7 +13,7 @@ from tempfile import NamedTemporaryFile
import numpy as np
import scipy.sparse
from .compat import (PANDAS_INSTALLED, DataFrame, Series,
from .compat import (PANDAS_INSTALLED, DataFrame, Series, is_dtype_sparse,
DataTable,
decode_string, string_type,
integer_types, numeric_types,
......@@ -78,6 +78,11 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'):
elif is_1d_list(data):
return np.array(data, dtype=dtype, copy=False)
elif isinstance(data, Series):
if _get_bad_pandas_dtypes([data.dtypes]):
raise ValueError('Series.dtypes must be int, float or bool')
if hasattr(data.values, 'values'): # SparseArray
return data.values.values.astype(dtype)
else:
return data.values.astype(dtype)
else:
raise TypeError("Wrong type({0}) for {1}.\n"
......@@ -194,11 +199,6 @@ FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32,
"feature_penalty": C_API_DTYPE_FLOAT64,
"monotone_constraints": C_API_DTYPE_INT8}
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'int64': 'int', 'uint8': 'int', 'uint16': 'int',
'uint32': 'int', 'uint64': 'int', 'bool': 'int',
'float16': 'float', 'float32': 'float', 'float64': 'float'}
def convert_from_sliced_object(data):
"""Fix the memory of multi-dimensional sliced object."""
......@@ -252,6 +252,17 @@ def c_int_array(data):
return (ptr_data, type_data, data) # return `data` to avoid the temporary copy is freed
def _get_bad_pandas_dtypes(dtypes):
pandas_dtype_mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'int64': 'int', 'uint8': 'int', 'uint16': 'int',
'uint32': 'int', 'uint64': 'int', 'bool': 'int',
'float16': 'float', 'float32': 'float', 'float64': 'float'}
bad_indices = [i for i, dtype in enumerate(dtypes) if (dtype.name not in pandas_dtype_mapper
and (not is_dtype_sparse(dtype)
or dtype.subtype.name not in pandas_dtype_mapper))]
return bad_indices
def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical):
if isinstance(data, DataFrame):
if len(data.shape) != 2 or data.shape[0] < 1:
......@@ -280,13 +291,11 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
categorical_feature = list(categorical_feature)
if feature_name == 'auto':
feature_name = list(data.columns)
data_dtypes = data.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
bad_fields = [data.columns[i] for i, dtype in
enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER]
bad_indices = _get_bad_pandas_dtypes(data.dtypes)
if bad_indices:
raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n"
"Did not expect the data types in the following fields: "
+ ', '.join(bad_fields))
+ ', '.join(data.columns[bad_indices]))
data = data.values.astype('float')
else:
if feature_name == 'auto':
......@@ -300,8 +309,7 @@ def _label_from_pandas(label):
if isinstance(label, DataFrame):
if len(label.columns) > 1:
raise ValueError('DataFrame for label cannot have multiple columns')
label_dtypes = label.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in label_dtypes):
if _get_bad_pandas_dtypes(label.dtypes):
raise ValueError('DataFrame.dtypes for label must be int, float or bool')
label = label.values.astype('float').flatten()
return label
......
......@@ -62,6 +62,7 @@ def json_default_with_numpy(obj):
"""pandas"""
try:
from pandas import Series, DataFrame
from pandas.api.types import is_sparse as is_dtype_sparse
PANDAS_INSTALLED = True
except ImportError:
PANDAS_INSTALLED = False
......@@ -76,6 +77,8 @@ except ImportError:
pass
is_dtype_sparse = None
"""matplotlib"""
try:
import matplotlib
......
......@@ -719,6 +719,32 @@ class TestEngine(unittest.TestCase):
self.assertListEqual(gbm6.pandas_categorical, cat_values)
self.assertListEqual(gbm7.pandas_categorical, cat_values)
@unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
def test_pandas_sparse(self):
    """Train on sparse pandas columns and check parity with dense predictions."""
    import pandas as pd

    def pandas_version():
        # Numeric (major, minor) tuple. Comparing version strings
        # lexicographically is wrong: '0.9.0' > '0.24.0' as strings.
        return tuple(int(part) for part in pd.__version__.split('.')[:2] if part.isdigit())

    X = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 1, 2] * 100)),
                      "B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
                      "C": pd.SparseArray(np.random.permutation([True, False] * 150))})
    y = pd.Series(pd.SparseArray(np.random.permutation([0, 1] * 150)))
    X_test = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 2] * 30)),
                           "B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
                           "C": pd.SparseArray(np.random.permutation([True, False] * 30))})
    if pandas_version() >= (0, 24):
        # Sparse dtype introspection via is_sparse is only meaningful on
        # pandas >= 0.24 — skip the dtype sanity check on older versions.
        for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
            self.assertTrue(pd.api.types.is_sparse(dtype))
    params = {
        'objective': 'binary',
        'verbose': -1
    }
    lgb_train = lgb.Dataset(X, y)
    gbm = lgb.train(params, lgb_train, num_boost_round=10)
    pred_sparse = gbm.predict(X_test, raw_score=True)
    # Newer pandas exposes the '.sparse' accessor for densifying; older
    # versions only have the (since removed) DataFrame.to_dense().
    if hasattr(X_test, 'sparse'):
        pred_dense = gbm.predict(X_test.sparse.to_dense(), raw_score=True)
    else:
        pred_dense = gbm.predict(X_test.to_dense(), raw_score=True)
    np.testing.assert_allclose(pred_sparse, pred_dense)
def test_reference_chain(self):
X = np.random.normal(size=(100, 2))
y = np.random.normal(size=100)
......
......@@ -277,6 +277,27 @@ class TestSklearn(unittest.TestCase):
self.assertListEqual(gbm5.booster_.pandas_categorical, cat_values)
self.assertListEqual(gbm6.booster_.pandas_categorical, cat_values)
@unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
def test_pandas_sparse(self):
    """Fit LGBMClassifier on sparse pandas input; predictions must match dense."""
    import pandas as pd

    def pandas_version():
        # Numeric (major, minor) tuple. Comparing version strings
        # lexicographically is wrong: '0.9.0' > '0.24.0' as strings.
        return tuple(int(part) for part in pd.__version__.split('.')[:2] if part.isdigit())

    X = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 1, 2] * 100)),
                      "B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
                      "C": pd.SparseArray(np.random.permutation([True, False] * 150))})
    y = pd.Series(pd.SparseArray(np.random.permutation([0, 1] * 150)))
    X_test = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 2] * 30)),
                           "B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
                           "C": pd.SparseArray(np.random.permutation([True, False] * 30))})
    if pandas_version() >= (0, 24):
        # Sparse dtype introspection via is_sparse is only meaningful on
        # pandas >= 0.24 — skip the dtype sanity check on older versions.
        for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
            self.assertTrue(pd.api.types.is_sparse(dtype))
    gbm = lgb.sklearn.LGBMClassifier().fit(X, y)
    pred_sparse = gbm.predict(X_test, raw_score=True)
    # Newer pandas exposes the '.sparse' accessor for densifying; older
    # versions only have the (since removed) DataFrame.to_dense().
    if hasattr(X_test, 'sparse'):
        pred_dense = gbm.predict(X_test.sparse.to_dense(), raw_score=True)
    else:
        pred_dense = gbm.predict(X_test.to_dense(), raw_score=True)
    np.testing.assert_allclose(pred_sparse, pred_dense)
def test_predict(self):
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment