"python-package/vscode:/vscode.git/clone" did not exist on "7e34d23c05599ce3a8a6f22cdba29e103f57d218"
Unverified Commit f1f5ba15 authored by José Morales, committed by GitHub
Browse files

[python-package] Support 2d collections as input for `init_score` in...


[python-package] Support 2d collections as input for `init_score` in multiclass classification task (#4150)

* initial implementation of init_score for multiclass classification

* check for 1d or 2d collection in init_score

* remove dataset import

* initial comments

* update dask test and docstrings

* update docstrings

* move logic to set_field. reshape back on get_field

* add type hints and update docstrings for dask. fix Dataset.set_field

* revert wrong docstrings and type hints

* add extra comma for consistency

* prefix private functions with underscore

add type hints to new functions

make commas consistent in dask and basic

* add missing spaces after type hint

* remove shape condition for dataframe in is_2d_collection
Co-authored-by: Nikita Titov <nekit94-12@hotmail.com>
parent eda0d3ca
...@@ -149,8 +149,8 @@ def is_numpy_column_array(data): ...@@ -149,8 +149,8 @@ def is_numpy_column_array(data):
return len(shape) == 2 and shape[1] == 1 return len(shape) == 2 and shape[1] == 1
def cast_numpy_1d_array_to_dtype(array, dtype): def cast_numpy_array_to_dtype(array, dtype):
"""Cast numpy 1d array to given dtype.""" """Cast numpy array to given dtype."""
if array.dtype == dtype: if array.dtype == dtype:
return array return array
return array.astype(dtype=dtype, copy=False) return array.astype(dtype=dtype, copy=False)
...@@ -161,14 +161,24 @@ def is_1d_list(data): ...@@ -161,14 +161,24 @@ def is_1d_list(data):
return isinstance(data, list) and (not data or is_numeric(data[0])) return isinstance(data, list) and (not data or is_numeric(data[0]))
def _is_1d_collection(data: Any) -> bool:
    """Return True if ``data`` is a one-dimensional collection (1-D numpy array,
    numpy column vector, flat list, or pandas Series)."""
    # Cheap predicate checks first; pandas Series handled last.
    for check in (is_numpy_1d_array, is_numpy_column_array, is_1d_list):
        if check(data):
            return True
    return isinstance(data, pd_Series)
def list_to_1d_numpy(data, dtype=np.float32, name='list'): def list_to_1d_numpy(data, dtype=np.float32, name='list'):
"""Convert data to numpy 1-D array.""" """Convert data to numpy 1-D array."""
if is_numpy_1d_array(data): if is_numpy_1d_array(data):
return cast_numpy_1d_array_to_dtype(data, dtype) return cast_numpy_array_to_dtype(data, dtype)
elif is_numpy_column_array(data): elif is_numpy_column_array(data):
_log_warning('Converting column-vector to 1d array') _log_warning('Converting column-vector to 1d array')
array = data.ravel() array = data.ravel()
return cast_numpy_1d_array_to_dtype(array, dtype) return cast_numpy_array_to_dtype(array, dtype)
elif is_1d_list(data): elif is_1d_list(data):
return np.array(data, dtype=dtype, copy=False) return np.array(data, dtype=dtype, copy=False)
elif isinstance(data, pd_Series): elif isinstance(data, pd_Series):
...@@ -180,6 +190,39 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'): ...@@ -180,6 +190,39 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'):
"It should be list, numpy 1-D array or pandas Series") "It should be list, numpy 1-D array or pandas Series")
def _is_numpy_2d_array(data: Any) -> bool:
"""Check whether data is a numpy 2-D array."""
return isinstance(data, np.ndarray) and len(data.shape) == 2 and data.shape[1] > 1
def _is_2d_list(data: Any) -> bool:
    """Return True if ``data`` is a non-empty list whose first element is a 1-D list.

    Only the first element is inspected; ragged inner lists are not detected here.
    """
    if not isinstance(data, list) or not data:
        return False
    return is_1d_list(data[0])
def _is_2d_collection(data: Any) -> bool:
    """Return True if ``data`` is a two-dimensional collection (2-D numpy array,
    list of lists, or pandas DataFrame)."""
    if isinstance(data, pd_DataFrame):
        return True
    return _is_numpy_2d_array(data) or _is_2d_list(data)
def _data_to_2d_numpy(data: Any, dtype: type = np.float32, name: str = 'list') -> np.ndarray:
    """Convert ``data`` into a numpy 2-D array of the requested ``dtype``.

    Accepts a 2-D numpy array, a list of lists, or a pandas DataFrame with
    numeric/bool columns; raises TypeError for anything else.  ``name`` is only
    used in the error message.
    """
    # Branches are mutually exclusive, so the check order is not significant.
    if isinstance(data, pd_DataFrame):
        if _get_bad_pandas_dtypes(data.dtypes):
            raise ValueError('DataFrame.dtypes must be int, float or bool')
        return cast_numpy_array_to_dtype(data.values, dtype)
    if _is_2d_list(data):
        return np.array(data, dtype=dtype)
    if _is_numpy_2d_array(data):
        return cast_numpy_array_to_dtype(data, dtype)
    raise TypeError(f"Wrong type({type(data).__name__}) for {name}.\n"
                    "It should be list of lists, numpy 2-D array or pandas DataFrame")
def cfloat32_array_to_numpy(cptr, length): def cfloat32_array_to_numpy(cptr, length):
"""Convert a ctypes float pointer array to a numpy array.""" """Convert a ctypes float pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_float)): if isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
...@@ -1145,7 +1188,7 @@ class Dataset: ...@@ -1145,7 +1188,7 @@ class Dataset:
sum(group) = n_samples. sum(group) = n_samples.
For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
init_score : list, numpy 1-D array, pandas Series or None, optional (default=None) init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None)
Init score for Dataset. Init score for Dataset.
silent : bool, optional (default=False) silent : bool, optional (default=False)
Whether to print messages during construction. Whether to print messages during construction.
...@@ -1792,7 +1835,7 @@ class Dataset: ...@@ -1792,7 +1835,7 @@ class Dataset:
sum(group) = n_samples. sum(group) = n_samples.
For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
init_score : list, numpy 1-D array, pandas Series or None, optional (default=None) init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None)
Init score for Dataset. Init score for Dataset.
silent : bool, optional (default=False) silent : bool, optional (default=False)
Whether to print messages during construction. Whether to print messages during construction.
...@@ -1899,8 +1942,8 @@ class Dataset: ...@@ -1899,8 +1942,8 @@ class Dataset:
---------- ----------
field_name : str field_name : str
The field name of the information. The field name of the information.
data : list, numpy 1-D array, pandas Series or None data : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None
The array of data to be set. The data to be set.
Returns Returns
------- -------
...@@ -1918,12 +1961,22 @@ class Dataset: ...@@ -1918,12 +1961,22 @@ class Dataset:
ctypes.c_int(0), ctypes.c_int(0),
ctypes.c_int(FIELD_TYPE_MAPPER[field_name]))) ctypes.c_int(FIELD_TYPE_MAPPER[field_name])))
return self return self
dtype = np.float32 if field_name == 'init_score':
if field_name == 'group':
dtype = np.int32
elif field_name == 'init_score':
dtype = np.float64 dtype = np.float64
if _is_1d_collection(data):
data = list_to_1d_numpy(data, dtype, name=field_name)
elif _is_2d_collection(data):
data = _data_to_2d_numpy(data, dtype, name=field_name)
data = data.ravel(order='F')
else:
raise TypeError(
'init_score must be list, numpy 1-D array or pandas Series.\n'
'In multiclass classification init_score can also be a list of lists, numpy 2-D array or pandas DataFrame.'
)
else:
dtype = np.int32 if field_name == 'group' else np.float32
data = list_to_1d_numpy(data, dtype, name=field_name) data = list_to_1d_numpy(data, dtype, name=field_name)
if data.dtype == np.float32 or data.dtype == np.float64: if data.dtype == np.float32 or data.dtype == np.float64:
ptr_data, type_data, _ = c_float_array(data) ptr_data, type_data, _ = c_float_array(data)
elif data.dtype == np.int32: elif data.dtype == np.int32:
...@@ -1970,13 +2023,19 @@ class Dataset: ...@@ -1970,13 +2023,19 @@ class Dataset:
if tmp_out_len.value == 0: if tmp_out_len.value == 0:
return None return None
if out_type.value == C_API_DTYPE_INT32: if out_type.value == C_API_DTYPE_INT32:
return cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), tmp_out_len.value) arr = cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), tmp_out_len.value)
elif out_type.value == C_API_DTYPE_FLOAT32: elif out_type.value == C_API_DTYPE_FLOAT32:
return cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value) arr = cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value)
elif out_type.value == C_API_DTYPE_FLOAT64: elif out_type.value == C_API_DTYPE_FLOAT64:
return cfloat64_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), tmp_out_len.value) arr = cfloat64_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), tmp_out_len.value)
else: else:
raise TypeError("Unknown type") raise TypeError("Unknown type")
if field_name == 'init_score':
num_data = self.num_data()
num_classes = arr.size // num_data
if num_classes > 1:
arr = arr.reshape((num_data, num_classes), order='F')
return arr
def set_categorical_feature(self, categorical_feature): def set_categorical_feature(self, categorical_feature):
"""Set categorical features. """Set categorical features.
...@@ -2128,7 +2187,7 @@ class Dataset: ...@@ -2128,7 +2187,7 @@ class Dataset:
Parameters Parameters
---------- ----------
init_score : list, numpy 1-D array, pandas Series or None init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None
Init score for Booster. Init score for Booster.
Returns Returns
...@@ -2138,7 +2197,6 @@ class Dataset: ...@@ -2138,7 +2197,6 @@ class Dataset:
""" """
self.init_score = init_score self.init_score = init_score
if self.handle is not None and init_score is not None: if self.handle is not None and init_score is not None:
init_score = list_to_1d_numpy(init_score, np.float64, name='init_score')
self.set_field('init_score', init_score) self.set_field('init_score', init_score)
self.init_score = self.get_field('init_score') # original values can be modified at cpp side self.init_score = self.get_field('init_score') # original values can be modified at cpp side
return self return self
......
...@@ -392,13 +392,13 @@ def _train( ...@@ -392,13 +392,13 @@ def _train(
params: Dict[str, Any], params: Dict[str, Any],
model_factory: Type[LGBMModel], model_factory: Type[LGBMModel],
sample_weight: Optional[_DaskVectorLike] = None, sample_weight: Optional[_DaskVectorLike] = None,
init_score: Optional[_DaskVectorLike] = None, init_score: Optional[_DaskCollection] = None,
group: Optional[_DaskVectorLike] = None, group: Optional[_DaskVectorLike] = None,
eval_set: Optional[List[Tuple[_DaskMatrixLike, _DaskCollection]]] = None, eval_set: Optional[List[Tuple[_DaskMatrixLike, _DaskCollection]]] = None,
eval_names: Optional[List[str]] = None, eval_names: Optional[List[str]] = None,
eval_sample_weight: Optional[List[_DaskVectorLike]] = None, eval_sample_weight: Optional[List[_DaskVectorLike]] = None,
eval_class_weight: Optional[List[Union[dict, str]]] = None, eval_class_weight: Optional[List[Union[dict, str]]] = None,
eval_init_score: Optional[List[_DaskVectorLike]] = None, eval_init_score: Optional[List[_DaskCollection]] = None,
eval_group: Optional[List[_DaskVectorLike]] = None, eval_group: Optional[List[_DaskVectorLike]] = None,
eval_metric: Optional[Union[Callable, str, List[Union[Callable, str]]]] = None, eval_metric: Optional[Union[Callable, str, List[Union[Callable, str]]]] = None,
eval_at: Optional[Iterable[int]] = None, eval_at: Optional[Iterable[int]] = None,
...@@ -420,7 +420,7 @@ def _train( ...@@ -420,7 +420,7 @@ def _train(
Class of the local underlying model. Class of the local underlying model.
sample_weight : Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None) sample_weight : Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)
Weights of training data. Weights of training data.
init_score : Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None) init_score : Dask Array or Dask Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task), or Dask Array or Dask DataFrame of shape = [n_samples, n_classes] (for multi-class task), or None, optional (default=None)
Init score of training data. Init score of training data.
group : Dask Array or Dask Series or None, optional (default=None) group : Dask Array or Dask Series or None, optional (default=None)
Group/query data. Group/query data.
...@@ -439,7 +439,7 @@ def _train( ...@@ -439,7 +439,7 @@ def _train(
Weights for each validation set in eval_set. Weights for each validation set in eval_set.
eval_class_weight : list of dict or str, or None, optional (default=None) eval_class_weight : list of dict or str, or None, optional (default=None)
Class weights, one dict or str for each validation set in eval_set. Class weights, one dict or str for each validation set in eval_set.
eval_init_score : list of Dask Array or Dask Series, or None, optional (default=None) eval_init_score : list of Dask Array, Dask Series or Dask DataFrame (for multi-class task), or None, optional (default=None)
Initial model score for each validation set in eval_set. Initial model score for each validation set in eval_set.
eval_group : list of Dask Array or Dask Series, or None, optional (default=None) eval_group : list of Dask Array or Dask Series, or None, optional (default=None)
Group/query for each validation set in eval_set. Group/query for each validation set in eval_set.
...@@ -1021,13 +1021,13 @@ class _DaskLGBMModel: ...@@ -1021,13 +1021,13 @@ class _DaskLGBMModel:
X: _DaskMatrixLike, X: _DaskMatrixLike,
y: _DaskCollection, y: _DaskCollection,
sample_weight: Optional[_DaskVectorLike] = None, sample_weight: Optional[_DaskVectorLike] = None,
init_score: Optional[_DaskVectorLike] = None, init_score: Optional[_DaskCollection] = None,
group: Optional[_DaskVectorLike] = None, group: Optional[_DaskVectorLike] = None,
eval_set: Optional[List[Tuple[_DaskMatrixLike, _DaskCollection]]] = None, eval_set: Optional[List[Tuple[_DaskMatrixLike, _DaskCollection]]] = None,
eval_names: Optional[List[str]] = None, eval_names: Optional[List[str]] = None,
eval_sample_weight: Optional[List[_DaskVectorLike]] = None, eval_sample_weight: Optional[List[_DaskVectorLike]] = None,
eval_class_weight: Optional[List[Union[dict, str]]] = None, eval_class_weight: Optional[List[Union[dict, str]]] = None,
eval_init_score: Optional[List[_DaskVectorLike]] = None, eval_init_score: Optional[List[_DaskCollection]] = None,
eval_group: Optional[List[_DaskVectorLike]] = None, eval_group: Optional[List[_DaskVectorLike]] = None,
eval_metric: Optional[Union[Callable, str, List[Union[Callable, str]]]] = None, eval_metric: Optional[Union[Callable, str, List[Union[Callable, str]]]] = None,
eval_at: Optional[Iterable[int]] = None, eval_at: Optional[Iterable[int]] = None,
...@@ -1159,12 +1159,12 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel): ...@@ -1159,12 +1159,12 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
X: _DaskMatrixLike, X: _DaskMatrixLike,
y: _DaskCollection, y: _DaskCollection,
sample_weight: Optional[_DaskVectorLike] = None, sample_weight: Optional[_DaskVectorLike] = None,
init_score: Optional[_DaskVectorLike] = None, init_score: Optional[_DaskCollection] = None,
eval_set: Optional[List[Tuple[_DaskMatrixLike, _DaskCollection]]] = None, eval_set: Optional[List[Tuple[_DaskMatrixLike, _DaskCollection]]] = None,
eval_names: Optional[List[str]] = None, eval_names: Optional[List[str]] = None,
eval_sample_weight: Optional[List[_DaskVectorLike]] = None, eval_sample_weight: Optional[List[_DaskVectorLike]] = None,
eval_class_weight: Optional[List[Union[dict, str]]] = None, eval_class_weight: Optional[List[Union[dict, str]]] = None,
eval_init_score: Optional[List[_DaskVectorLike]] = None, eval_init_score: Optional[List[_DaskCollection]] = None,
eval_metric: Optional[Union[Callable, str, List[Union[Callable, str]]]] = None, eval_metric: Optional[Union[Callable, str, List[Union[Callable, str]]]] = None,
early_stopping_rounds: Optional[int] = None, early_stopping_rounds: Optional[int] = None,
**kwargs: Any **kwargs: Any
...@@ -1192,10 +1192,10 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel): ...@@ -1192,10 +1192,10 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):
X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]", X_shape="Dask Array or Dask DataFrame of shape = [n_samples, n_features]",
y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]", y_shape="Dask Array, Dask DataFrame or Dask Series of shape = [n_samples]",
sample_weight_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)", sample_weight_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)",
init_score_shape="Dask Array or Dask Series of shape = [n_samples] or None, optional (default=None)", init_score_shape="Dask Array or Dask Series of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task), or Dask Array or Dask DataFrame of shape = [n_samples, n_classes] (for multi-class task), or None, optional (default=None)",
group_shape="Dask Array or Dask Series or None, optional (default=None)", group_shape="Dask Array or Dask Series or None, optional (default=None)",
eval_sample_weight_shape="list of Dask Array or Dask Series, or None, optional (default=None)", eval_sample_weight_shape="list of Dask Array or Dask Series, or None, optional (default=None)",
eval_init_score_shape="list of Dask Array or Dask Series, or None, optional (default=None)", eval_init_score_shape="list of Dask Array, Dask Series or Dask DataFrame (for multi-class task), or None, optional (default=None)",
eval_group_shape="list of Dask Array or Dask Series, or None, optional (default=None)" eval_group_shape="list of Dask Array or Dask Series, or None, optional (default=None)"
) )
......
...@@ -778,7 +778,7 @@ class LGBMModel(_LGBMModelBase): ...@@ -778,7 +778,7 @@ class LGBMModel(_LGBMModelBase):
X_shape="array-like or sparse matrix of shape = [n_samples, n_features]", X_shape="array-like or sparse matrix of shape = [n_samples, n_features]",
y_shape="array-like of shape = [n_samples]", y_shape="array-like of shape = [n_samples]",
sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)", sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)",
init_score_shape="array-like of shape = [n_samples] or None, optional (default=None)", init_score_shape="array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
group_shape="array-like or None, optional (default=None)", group_shape="array-like or None, optional (default=None)",
eval_sample_weight_shape="list of array, or None, optional (default=None)", eval_sample_weight_shape="list of array, or None, optional (default=None)",
eval_init_score_shape="list of array, or None, optional (default=None)", eval_init_score_shape="list of array, or None, optional (default=None)",
......
...@@ -10,7 +10,7 @@ from sklearn.datasets import dump_svmlight_file, load_svmlight_file ...@@ -10,7 +10,7 @@ from sklearn.datasets import dump_svmlight_file, load_svmlight_file
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
import lightgbm as lgb import lightgbm as lgb
from lightgbm.compat import PANDAS_INSTALLED, pd_Series from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame, pd_Series
from .utils import load_breast_cancer from .utils import load_breast_cancer
...@@ -538,3 +538,18 @@ def test_list_to_1d_numpy(y, dtype): ...@@ -538,3 +538,18 @@ def test_list_to_1d_numpy(y, dtype):
result = lgb.basic.list_to_1d_numpy(y, dtype=dtype) result = lgb.basic.list_to_1d_numpy(y, dtype=dtype)
assert result.size == 10 assert result.size == 10
assert result.dtype == dtype assert result.dtype == dtype
@pytest.mark.parametrize('init_score_type', ['array', 'dataframe', 'list'])
def test_init_score_for_multiclass_classification(init_score_type):
    """Round-trip a 2-D init_score through Dataset construction for each input type."""
    n_rows, n_classes = 10, 3
    # Distinct value per (row, class) cell so any reshuffling would be detected.
    init_score = [[row * 10 + col for col in range(n_classes)] for row in range(n_rows)]
    if init_score_type == 'dataframe':
        if not PANDAS_INSTALLED:
            pytest.skip('Pandas is not installed.')
        init_score = pd_DataFrame(init_score)
    elif init_score_type == 'array':
        init_score = np.array(init_score)
    data = np.random.rand(n_rows, 2)
    ds = lgb.Dataset(data, init_score=init_score).construct()
    # Values must survive set_field/get_field unchanged, in both accessors.
    np.testing.assert_equal(ds.get_field('init_score'), init_score)
    np.testing.assert_equal(ds.init_score, init_score)
...@@ -1582,17 +1582,14 @@ def test_init_score(task, output, cluster): ...@@ -1582,17 +1582,14 @@ def test_init_score(task, output, cluster):
'time_out': 5 'time_out': 5
} }
init_score = random.random() init_score = random.random()
# init_scores must be a 1D array, even for multiclass classification
# where you need to provide 1 score per class for each row in X
# https://github.com/microsoft/LightGBM/issues/4046
size_factor = 1 size_factor = 1
if task == 'multiclass-classification': if task == 'multiclass-classification':
size_factor = 3 # number of classes size_factor = 3 # number of classes
if output.startswith('dataframe'): if output.startswith('dataframe'):
init_scores = dy.map_partitions(lambda x: pd.Series([init_score] * x.size * size_factor)) init_scores = dy.map_partitions(lambda x: pd.DataFrame([[init_score] * size_factor] * x.size))
else: else:
init_scores = dy.map_blocks(lambda x: np.repeat(init_score, x.size * size_factor)) init_scores = dy.map_blocks(lambda x: np.full((x.size, size_factor), init_score))
model = model_factory(client=client, **params) model = model_factory(client=client, **params)
model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg) model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg)
# value of the root node is 0 when init_score is set # value of the root node is 0 when init_score is set
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment