Unverified Commit 8f446be7 authored by Nikita Titov's avatar Nikita Titov Committed by GitHub
Browse files

[python] add sparsity support for new version of pandas and check Series for bad dtypes (#2318)

* reworked pandas dtypes mapper

* added tests

* added sparsity support for new version of pandas

* fixed tests for old pandas

* check pd.Series for bad dtypes as well

* enhanced tests

* fixed pylint
parent 9cf6b828
...@@ -13,7 +13,7 @@ from tempfile import NamedTemporaryFile ...@@ -13,7 +13,7 @@ from tempfile import NamedTemporaryFile
import numpy as np import numpy as np
import scipy.sparse import scipy.sparse
from .compat import (PANDAS_INSTALLED, DataFrame, Series, from .compat import (PANDAS_INSTALLED, DataFrame, Series, is_dtype_sparse,
DataTable, DataTable,
decode_string, string_type, decode_string, string_type,
integer_types, numeric_types, integer_types, numeric_types,
...@@ -78,7 +78,12 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'): ...@@ -78,7 +78,12 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'):
elif is_1d_list(data): elif is_1d_list(data):
return np.array(data, dtype=dtype, copy=False) return np.array(data, dtype=dtype, copy=False)
elif isinstance(data, Series): elif isinstance(data, Series):
return data.values.astype(dtype) if _get_bad_pandas_dtypes([data.dtypes]):
raise ValueError('Series.dtypes must be int, float or bool')
if hasattr(data.values, 'values'): # SparseArray
return data.values.values.astype(dtype)
else:
return data.values.astype(dtype)
else: else:
raise TypeError("Wrong type({0}) for {1}.\n" raise TypeError("Wrong type({0}) for {1}.\n"
"It should be list, numpy 1-D array or pandas Series".format(type(data).__name__, name)) "It should be list, numpy 1-D array or pandas Series".format(type(data).__name__, name))
...@@ -194,11 +199,6 @@ FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32, ...@@ -194,11 +199,6 @@ FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32,
"feature_penalty": C_API_DTYPE_FLOAT64, "feature_penalty": C_API_DTYPE_FLOAT64,
"monotone_constraints": C_API_DTYPE_INT8} "monotone_constraints": C_API_DTYPE_INT8}
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'int64': 'int', 'uint8': 'int', 'uint16': 'int',
'uint32': 'int', 'uint64': 'int', 'bool': 'int',
'float16': 'float', 'float32': 'float', 'float64': 'float'}
def convert_from_sliced_object(data): def convert_from_sliced_object(data):
"""Fix the memory of multi-dimensional sliced object.""" """Fix the memory of multi-dimensional sliced object."""
...@@ -252,6 +252,17 @@ def c_int_array(data): ...@@ -252,6 +252,17 @@ def c_int_array(data):
return (ptr_data, type_data, data) # return `data` to avoid the temporary copy is freed return (ptr_data, type_data, data) # return `data` to avoid the temporary copy is freed
def _get_bad_pandas_dtypes(dtypes):
pandas_dtype_mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'int64': 'int', 'uint8': 'int', 'uint16': 'int',
'uint32': 'int', 'uint64': 'int', 'bool': 'int',
'float16': 'float', 'float32': 'float', 'float64': 'float'}
bad_indices = [i for i, dtype in enumerate(dtypes) if (dtype.name not in pandas_dtype_mapper
and (not is_dtype_sparse(dtype)
or dtype.subtype.name not in pandas_dtype_mapper))]
return bad_indices
def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical): def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical):
if isinstance(data, DataFrame): if isinstance(data, DataFrame):
if len(data.shape) != 2 or data.shape[0] < 1: if len(data.shape) != 2 or data.shape[0] < 1:
...@@ -280,13 +291,11 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica ...@@ -280,13 +291,11 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
categorical_feature = list(categorical_feature) categorical_feature = list(categorical_feature)
if feature_name == 'auto': if feature_name == 'auto':
feature_name = list(data.columns) feature_name = list(data.columns)
data_dtypes = data.dtypes bad_indices = _get_bad_pandas_dtypes(data.dtypes)
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes): if bad_indices:
bad_fields = [data.columns[i] for i, dtype in
enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER]
raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n" raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n"
"Did not expect the data types in the following fields: " "Did not expect the data types in the following fields: "
+ ', '.join(bad_fields)) + ', '.join(data.columns[bad_indices]))
data = data.values.astype('float') data = data.values.astype('float')
else: else:
if feature_name == 'auto': if feature_name == 'auto':
...@@ -300,8 +309,7 @@ def _label_from_pandas(label): ...@@ -300,8 +309,7 @@ def _label_from_pandas(label):
if isinstance(label, DataFrame): if isinstance(label, DataFrame):
if len(label.columns) > 1: if len(label.columns) > 1:
raise ValueError('DataFrame for label cannot have multiple columns') raise ValueError('DataFrame for label cannot have multiple columns')
label_dtypes = label.dtypes if _get_bad_pandas_dtypes(label.dtypes):
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in label_dtypes):
raise ValueError('DataFrame.dtypes for label must be int, float or bool') raise ValueError('DataFrame.dtypes for label must be int, float or bool')
label = label.values.astype('float').flatten() label = label.values.astype('float').flatten()
return label return label
......
...@@ -62,6 +62,7 @@ def json_default_with_numpy(obj): ...@@ -62,6 +62,7 @@ def json_default_with_numpy(obj):
"""pandas""" """pandas"""
try: try:
from pandas import Series, DataFrame from pandas import Series, DataFrame
from pandas.api.types import is_sparse as is_dtype_sparse
PANDAS_INSTALLED = True PANDAS_INSTALLED = True
except ImportError: except ImportError:
PANDAS_INSTALLED = False PANDAS_INSTALLED = False
...@@ -76,6 +77,8 @@ except ImportError: ...@@ -76,6 +77,8 @@ except ImportError:
pass pass
is_dtype_sparse = None
"""matplotlib""" """matplotlib"""
try: try:
import matplotlib import matplotlib
......
...@@ -719,6 +719,32 @@ class TestEngine(unittest.TestCase): ...@@ -719,6 +719,32 @@ class TestEngine(unittest.TestCase):
self.assertListEqual(gbm6.pandas_categorical, cat_values) self.assertListEqual(gbm6.pandas_categorical, cat_values)
self.assertListEqual(gbm7.pandas_categorical, cat_values) self.assertListEqual(gbm7.pandas_categorical, cat_values)
@unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
def test_pandas_sparse(self):
    """Train on all-sparse pandas input; predictions must match the dense equivalent."""
    import pandas as pd

    def sparse_frame(spec):
        # Build a DataFrame in which every column is a pandas SparseArray.
        return pd.DataFrame({name: pd.SparseArray(np.random.permutation(values))
                             for name, values in spec})

    X = sparse_frame([("A", [0, 1, 2] * 100),
                      ("B", [0.0, 0.1, 0.2, -0.1, 0.2] * 60),
                      ("C", [True, False] * 150)])
    y = pd.Series(pd.SparseArray(np.random.permutation([0, 1] * 150)))
    X_test = sparse_frame([("A", [0, 2] * 30),
                           ("B", [0.0, 0.1, 0.2, -0.1] * 15),
                           ("C", [True, False] * 30)])
    # NOTE(review): lexicographic version compare — fine for the versions in
    # CI, but not a general version check.
    if pd.__version__ >= '0.24.0':
        # Sanity check: every dtype really is sparse before it is fed in.
        for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
            self.assertTrue(pd.api.types.is_sparse(dtype))
    params = {
        'objective': 'binary',
        'verbose': -1
    }
    gbm = lgb.train(params, lgb.Dataset(X, y), num_boost_round=10)
    pred_sparse = gbm.predict(X_test, raw_score=True)
    # New pandas densifies via the .sparse accessor; old pandas via
    # DataFrame.to_dense().
    accessor = getattr(X_test, 'sparse', None)
    dense = X_test.to_dense() if accessor is None else accessor.to_dense()
    pred_dense = gbm.predict(dense, raw_score=True)
    np.testing.assert_allclose(pred_sparse, pred_dense)
def test_reference_chain(self): def test_reference_chain(self):
X = np.random.normal(size=(100, 2)) X = np.random.normal(size=(100, 2))
y = np.random.normal(size=100) y = np.random.normal(size=100)
......
...@@ -277,6 +277,27 @@ class TestSklearn(unittest.TestCase): ...@@ -277,6 +277,27 @@ class TestSklearn(unittest.TestCase):
self.assertListEqual(gbm5.booster_.pandas_categorical, cat_values) self.assertListEqual(gbm5.booster_.pandas_categorical, cat_values)
self.assertListEqual(gbm6.booster_.pandas_categorical, cat_values) self.assertListEqual(gbm6.booster_.pandas_categorical, cat_values)
@unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
def test_pandas_sparse(self):
    """Fit the sklearn wrapper on sparse pandas data; predictions must match dense."""
    import pandas as pd
    X = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 1, 2] * 100)),
                      "B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
                      "C": pd.SparseArray(np.random.permutation([True, False] * 150))})
    y = pd.Series(pd.SparseArray(np.random.permutation([0, 1] * 150)))
    X_test = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 2] * 30)),
                           "B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
                           "C": pd.SparseArray(np.random.permutation([True, False] * 30))})
    # NOTE(review): lexicographic version compare — OK for the versions in CI.
    if pd.__version__ >= '0.24.0':
        # Every input dtype must actually be sparse for this test to mean anything.
        all_dtypes = pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)])
        for dtype in all_dtypes:
            self.assertTrue(pd.api.types.is_sparse(dtype))
    gbm = lgb.sklearn.LGBMClassifier().fit(X, y)
    pred_sparse = gbm.predict(X_test, raw_score=True)
    # Densify with whichever API this pandas version provides.
    if hasattr(X_test, 'sparse'):
        X_test_dense = X_test.sparse.to_dense()
    else:
        X_test_dense = X_test.to_dense()
    pred_dense = gbm.predict(X_test_dense, raw_score=True)
    np.testing.assert_allclose(pred_sparse, pred_dense)
def test_predict(self): def test_predict(self):
iris = load_iris() iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment