Commit a7d2b174 authored by Nikita Titov, committed by Guolin Ke
Browse files

[python] fixed one-column DataFrame label and clarified input types in docstrings (#1648)

* added test for pandas label of Dataset

* fix when label type is pandas DataFrame; document possible pandas Series type
parent 536f5dde
...@@ -78,7 +78,8 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'): ...@@ -78,7 +78,8 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'):
elif isinstance(data, Series): elif isinstance(data, Series):
return data.values.astype(dtype) return data.values.astype(dtype)
else: else:
raise TypeError("Wrong type({}) for {}, should be list or numpy array".format(type(data).__name__, name)) raise TypeError("Wrong type({0}) for {1}.\n"
"It should be list, numpy 1-D array or pandas Series".format(type(data).__name__, name))
def cfloat32_array_to_numpy(cptr, length): def cfloat32_array_to_numpy(cptr, length):
...@@ -288,7 +289,7 @@ def _label_from_pandas(label): ...@@ -288,7 +289,7 @@ def _label_from_pandas(label):
label_dtypes = label.dtypes label_dtypes = label.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in label_dtypes): if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in label_dtypes):
raise ValueError('DataFrame.dtypes for label must be int, float or bool') raise ValueError('DataFrame.dtypes for label must be int, float or bool')
label = label.values.astype('float') label = label.values.astype('float').flatten()
return label return label
...@@ -382,7 +383,7 @@ class _InnerPredictor(object): ...@@ -382,7 +383,7 @@ class _InnerPredictor(object):
Parameters Parameters
---------- ----------
data : string/numpy array/scipy.sparse data : string, numpy array, pandas DataFrame or scipy.sparse
Data source for prediction Data source for prediction
When data type is string, it represents the path of txt file When data type is string, it represents the path of txt file
num_iteration : int num_iteration : int
...@@ -621,18 +622,18 @@ class Dataset(object): ...@@ -621,18 +622,18 @@ class Dataset(object):
Parameters Parameters
---------- ----------
data : string, numpy array, scipy.sparse or list of numpy arrays data : string, numpy array, pandas DataFrame, scipy.sparse or list of numpy arrays
Data source of Dataset. Data source of Dataset.
If string, it represents the path to txt file. If string, it represents the path to txt file.
label : list, numpy 1-D array or None, optional (default=None) label : list, numpy 1-D array, pandas one-column DataFrame/Series or None, optional (default=None)
Label of the data. Label of the data.
reference : Dataset or None, optional (default=None) reference : Dataset or None, optional (default=None)
If this is Dataset for validation, training data should be used as reference. If this is Dataset for validation, training data should be used as reference.
weight : list, numpy 1-D array or None, optional (default=None) weight : list, numpy 1-D array, pandas Series or None, optional (default=None)
Weight for each instance. Weight for each instance.
group : list, numpy 1-D array or None, optional (default=None) group : list, numpy 1-D array, pandas Series or None, optional (default=None)
Group/query size for Dataset. Group/query size for Dataset.
init_score : list, numpy 1-D array or None, optional (default=None) init_score : list, numpy 1-D array, pandas Series or None, optional (default=None)
Init score for Dataset. Init score for Dataset.
silent : bool, optional (default=False) silent : bool, optional (default=False)
Whether to print messages during construction. Whether to print messages during construction.
...@@ -965,16 +966,16 @@ class Dataset(object): ...@@ -965,16 +966,16 @@ class Dataset(object):
Parameters Parameters
---------- ----------
data : string, numpy array or scipy.sparse data : string, numpy array, pandas DataFrame, scipy.sparse or list of numpy arrays
Data source of Dataset. Data source of Dataset.
If string, it represents the path to txt file. If string, it represents the path to txt file.
label : list or numpy 1-D array, optional (default=None) label : list, numpy 1-D array, pandas one-column DataFrame/Series or None, optional (default=None)
Label of the training data. Label of the data.
weight : list, numpy 1-D array or None, optional (default=None) weight : list, numpy 1-D array, pandas Series or None, optional (default=None)
Weight for each instance. Weight for each instance.
group : list, numpy 1-D array or None, optional (default=None) group : list, numpy 1-D array, pandas Series or None, optional (default=None)
Group/query size for Dataset. Group/query size for Dataset.
init_score : list, numpy 1-D array or None, optional (default=None) init_score : list, numpy 1-D array, pandas Series or None, optional (default=None)
Init score for Dataset. Init score for Dataset.
silent : bool, optional (default=False) silent : bool, optional (default=False)
Whether to print messages during construction. Whether to print messages during construction.
...@@ -1055,7 +1056,7 @@ class Dataset(object): ...@@ -1055,7 +1056,7 @@ class Dataset(object):
---------- ----------
field_name : string field_name : string
The field name of the information. The field name of the information.
data : list, numpy array or None data : list, numpy 1-D array, pandas Series or None
The array of data to be set. The array of data to be set.
Returns Returns
...@@ -1228,7 +1229,7 @@ class Dataset(object): ...@@ -1228,7 +1229,7 @@ class Dataset(object):
Parameters Parameters
---------- ----------
label : list, numpy array or None label : list, numpy 1-D array, pandas one-column DataFrame/Series or None
The label information to be set into Dataset. The label information to be set into Dataset.
Returns Returns
...@@ -1238,7 +1239,7 @@ class Dataset(object): ...@@ -1238,7 +1239,7 @@ class Dataset(object):
""" """
self.label = label self.label = label
if self.handle is not None: if self.handle is not None:
label = list_to_1d_numpy(label, name='label') label = list_to_1d_numpy(_label_from_pandas(label), name='label')
self.set_field('label', label) self.set_field('label', label)
return self return self
...@@ -1247,7 +1248,7 @@ class Dataset(object): ...@@ -1247,7 +1248,7 @@ class Dataset(object):
Parameters Parameters
---------- ----------
weight : list, numpy array or None weight : list, numpy 1-D array, pandas Series or None
Weight to be set for each data point. Weight to be set for each data point.
Returns Returns
...@@ -1268,7 +1269,7 @@ class Dataset(object): ...@@ -1268,7 +1269,7 @@ class Dataset(object):
Parameters Parameters
---------- ----------
init_score : list, numpy array or None init_score : list, numpy 1-D array, pandas Series or None
Init score for Booster. Init score for Booster.
Returns Returns
...@@ -1287,7 +1288,7 @@ class Dataset(object): ...@@ -1287,7 +1288,7 @@ class Dataset(object):
Parameters Parameters
---------- ----------
group : list, numpy array or None group : list, numpy 1-D array, pandas Series or None
Group size of each group. Group size of each group.
Returns Returns
...@@ -2055,7 +2056,7 @@ class Booster(object): ...@@ -2055,7 +2056,7 @@ class Booster(object):
Parameters Parameters
---------- ----------
data : string, numpy array or scipy.sparse data : string, numpy array, pandas DataFrame or scipy.sparse
Data source for prediction. Data source for prediction.
If string, it represents the path to txt file. If string, it represents the path to txt file.
num_iteration : int or None, optional (default=None) num_iteration : int or None, optional (default=None)
...@@ -2107,10 +2108,10 @@ class Booster(object): ...@@ -2107,10 +2108,10 @@ class Booster(object):
Parameters Parameters
---------- ----------
data : string, numpy array or scipy.sparse data : string, numpy array, pandas DataFrame or scipy.sparse
Data source for refit. Data source for refit.
If string, it represents the path to txt file. If string, it represents the path to txt file.
label : list or numpy 1-D array label : list, numpy 1-D array or pandas one-column DataFrame/Series
Label for refit. Label for refit.
decay_rate : float, optional (default=0.9) decay_rate : float, optional (default=0.9)
Decay rate of refit, will use ``leaf_output = decay_rate * old_leaf_output + (1.0 - decay_rate) * new_leaf_output`` to refit trees. Decay rate of refit, will use ``leaf_output = decay_rate * old_leaf_output + (1.0 - decay_rate) * new_leaf_output`` to refit trees.
......
...@@ -495,11 +495,11 @@ class TestEngine(unittest.TestCase): ...@@ -495,11 +495,11 @@ class TestEngine(unittest.TestCase):
lgb_train = lgb.Dataset(X, y) lgb_train = lgb.Dataset(X, y)
gbm0 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False) gbm0 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False)
pred0 = list(gbm0.predict(X_test)) pred0 = list(gbm0.predict(X_test))
lgb_train = lgb.Dataset(X, y) lgb_train = lgb.Dataset(X, pd.DataFrame(y)) # also test that label can be one-column pd.DataFrame
gbm1 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False, gbm1 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
categorical_feature=[0]) categorical_feature=[0])
pred1 = list(gbm1.predict(X_test)) pred1 = list(gbm1.predict(X_test))
lgb_train = lgb.Dataset(X, y) lgb_train = lgb.Dataset(X, pd.Series(y)) # also test that label can be pd.Series
gbm2 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False, gbm2 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
categorical_feature=['A']) categorical_feature=['A'])
pred2 = list(gbm2.predict(X_test)) pred2 = list(gbm2.predict(X_test))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment