Unverified Commit 9f035100 authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

[python-packages] [docs] add type hints and define 'array-like' for X, y,...

[python-packages] [docs] add type hints and define 'array-like' for X, y, group in scikit-learn interface (#5757)
parent 1113cb7a
...@@ -72,7 +72,8 @@ _LGBM_TrainDataType = Union[ ...@@ -72,7 +72,8 @@ _LGBM_TrainDataType = Union[
List[np.ndarray] List[np.ndarray]
] ]
_LGBM_LabelType = Union[ _LGBM_LabelType = Union[
list, List[float],
List[int],
np.ndarray, np.ndarray,
pd_Series, pd_Series,
pd_DataFrame pd_DataFrame
......
...@@ -6,10 +6,11 @@ from pathlib import Path ...@@ -6,10 +6,11 @@ from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Union from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np import numpy as np
import scipy.sparse
from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _LGBM_BoosterBestScoreType, from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _LGBM_BoosterBestScoreType,
_LGBM_CategoricalFeatureConfiguration, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration, _LGBM_CategoricalFeatureConfiguration, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration,
_log_warning) _LGBM_GroupType, _LGBM_LabelType, _log_warning)
from .callback import _EvalResultDict, record_evaluation from .callback import _EvalResultDict, record_evaluation
from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray, from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray,
_LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase, _LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase,
...@@ -24,6 +25,13 @@ __all__ = [ ...@@ -24,6 +25,13 @@ __all__ = [
'LGBMRegressor', 'LGBMRegressor',
] ]
# Container types accepted for the feature matrix ``X`` by the scikit-learn
# estimator API (``fit`` / ``predict`` and friends).
# NOTE(review): ``dt_DataTable`` and ``pd_DataFrame`` appear to come from the
# ``.compat`` module and may be placeholder classes when datatable / pandas
# are not installed — confirm against ``compat.py``.
_LGBM_ScikitMatrixLike = Union[
    dt_DataTable,
    List[Union[List[float], List[int]]],
    np.ndarray,
    pd_DataFrame,
    scipy.sparse.spmatrix
]
_LGBM_ScikitCustomObjectiveFunction = Union[ _LGBM_ScikitCustomObjectiveFunction = Union[
Callable[ Callable[
[np.ndarray, np.ndarray], [np.ndarray, np.ndarray],
...@@ -697,11 +705,11 @@ class LGBMModel(_LGBMModelBase): ...@@ -697,11 +705,11 @@ class LGBMModel(_LGBMModelBase):
def fit( def fit(
self, self,
X, X: _LGBM_ScikitMatrixLike,
y, y: _LGBM_LabelType,
sample_weight=None, sample_weight=None,
init_score=None, init_score=None,
group=None, group: Optional[_LGBM_GroupType] = None,
eval_set=None, eval_set=None,
eval_names: Optional[List[str]] = None, eval_names: Optional[List[str]] = None,
eval_sample_weight=None, eval_sample_weight=None,
...@@ -829,11 +837,11 @@ class LGBMModel(_LGBMModelBase): ...@@ -829,11 +837,11 @@ class LGBMModel(_LGBMModelBase):
return self return self
fit.__doc__ = _lgbmmodel_doc_fit.format( fit.__doc__ = _lgbmmodel_doc_fit.format(
X_shape="array-like or sparse matrix of shape = [n_samples, n_features]", X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
y_shape="array-like of shape = [n_samples]", y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples]",
sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)", sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)",
init_score_shape="array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)", init_score_shape="array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
group_shape="array-like or None, optional (default=None)", group_shape="numpy array, pandas Series, list of int or float, or None, optional (default=None)",
eval_sample_weight_shape="list of array, or None, optional (default=None)", eval_sample_weight_shape="list of array, or None, optional (default=None)",
eval_init_score_shape="list of array, or None, optional (default=None)", eval_init_score_shape="list of array, or None, optional (default=None)",
eval_group_shape="list of array, or None, optional (default=None)" eval_group_shape="list of array, or None, optional (default=None)"
...@@ -841,7 +849,7 @@ class LGBMModel(_LGBMModelBase): ...@@ -841,7 +849,7 @@ class LGBMModel(_LGBMModelBase):
def predict( def predict(
self, self,
X, X: _LGBM_ScikitMatrixLike,
raw_score: bool = False, raw_score: bool = False,
start_iteration: int = 0, start_iteration: int = 0,
num_iteration: Optional[int] = None, num_iteration: Optional[int] = None,
...@@ -889,7 +897,7 @@ class LGBMModel(_LGBMModelBase): ...@@ -889,7 +897,7 @@ class LGBMModel(_LGBMModelBase):
predict.__doc__ = _lgbmmodel_doc_predict.format( predict.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted value for each sample.", description="Return the predicted value for each sample.",
X_shape="array-like or sparse matrix of shape = [n_samples, n_features]", X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
output_name="predicted_result", output_name="predicted_result",
predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]", predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]",
X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
...@@ -993,8 +1001,8 @@ class LGBMRegressor(_LGBMRegressorBase, LGBMModel): ...@@ -993,8 +1001,8 @@ class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
def fit( # type: ignore[override] def fit( # type: ignore[override]
self, self,
X, X: _LGBM_ScikitMatrixLike,
y, y: _LGBM_LabelType,
sample_weight=None, sample_weight=None,
init_score=None, init_score=None,
eval_set=None, eval_set=None,
...@@ -1039,8 +1047,8 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel): ...@@ -1039,8 +1047,8 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
def fit( # type: ignore[override] def fit( # type: ignore[override]
self, self,
X, X: _LGBM_ScikitMatrixLike,
y, y: _LGBM_LabelType,
sample_weight=None, sample_weight=None,
init_score=None, init_score=None,
eval_set=None, eval_set=None,
...@@ -1127,7 +1135,7 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel): ...@@ -1127,7 +1135,7 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
def predict( def predict(
self, self,
X, X: _LGBM_ScikitMatrixLike,
raw_score: bool = False, raw_score: bool = False,
start_iteration: int = 0, start_iteration: int = 0,
num_iteration: Optional[int] = None, num_iteration: Optional[int] = None,
...@@ -1157,7 +1165,7 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel): ...@@ -1157,7 +1165,7 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
def predict_proba( def predict_proba(
self, self,
X, X: _LGBM_ScikitMatrixLike,
raw_score: bool = False, raw_score: bool = False,
start_iteration: int = 0, start_iteration: int = 0,
num_iteration: Optional[int] = None, num_iteration: Optional[int] = None,
...@@ -1189,7 +1197,7 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel): ...@@ -1189,7 +1197,7 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
predict_proba.__doc__ = _lgbmmodel_doc_predict.format( predict_proba.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted probability for each class for each sample.", description="Return the predicted probability for each class for each sample.",
X_shape="array-like or sparse matrix of shape = [n_samples, n_features]", X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
output_name="predicted_probability", output_name="predicted_probability",
predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]", predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]",
X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
...@@ -1223,11 +1231,11 @@ class LGBMRanker(LGBMModel): ...@@ -1223,11 +1231,11 @@ class LGBMRanker(LGBMModel):
def fit( # type: ignore[override] def fit( # type: ignore[override]
self, self,
X, X: _LGBM_ScikitMatrixLike,
y, y: _LGBM_LabelType,
sample_weight=None, sample_weight=None,
init_score=None, init_score=None,
group=None, group: Optional[_LGBM_GroupType] = None,
eval_set=None, eval_set=None,
eval_names: Optional[List[str]] = None, eval_names: Optional[List[str]] = None,
eval_sample_weight=None, eval_sample_weight=None,
......
...@@ -9,17 +9,19 @@ from pathlib import Path ...@@ -9,17 +9,19 @@ from pathlib import Path
import joblib import joblib
import numpy as np import numpy as np
import pytest import pytest
import scipy.sparse
from scipy.stats import spearmanr
from sklearn.base import clone from sklearn.base import clone
from sklearn.datasets import load_svmlight_file, make_blobs, make_multilabel_classification from sklearn.datasets import load_svmlight_file, make_blobs, make_multilabel_classification
from sklearn.ensemble import StackingClassifier, StackingRegressor from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.metrics import log_loss, mean_squared_error from sklearn.metrics import accuracy_score, log_loss, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.multioutput import ClassifierChain, MultiOutputClassifier, MultiOutputRegressor, RegressorChain from sklearn.multioutput import ClassifierChain, MultiOutputClassifier, MultiOutputRegressor, RegressorChain
from sklearn.utils.estimator_checks import parametrize_with_checks from sklearn.utils.estimator_checks import parametrize_with_checks
from sklearn.utils.validation import check_is_fitted from sklearn.utils.validation import check_is_fitted
import lightgbm as lgb import lightgbm as lgb
from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame from lightgbm.compat import DATATABLE_INSTALLED, PANDAS_INSTALLED, dt_DataTable, pd_DataFrame, pd_Series
from .utils import (load_breast_cancer, load_digits, load_iris, load_linnerud, make_ranking, make_synthetic_regression, from .utils import (load_breast_cancer, load_digits, load_iris, load_linnerud, make_ranking, make_synthetic_regression,
sklearn_multiclass_custom_objective, softmax) sklearn_multiclass_custom_objective, softmax)
...@@ -27,20 +29,27 @@ from .utils import (load_breast_cancer, load_digits, load_iris, load_linnerud, m ...@@ -27,20 +29,27 @@ from .utils import (load_breast_cancer, load_digits, load_iris, load_linnerud, m
decreasing_generator = itertools.count(0, -1) decreasing_generator = itertools.count(0, -1)
task_to_model_factory = { task_to_model_factory = {
'ranking': lgb.LGBMRanker, 'ranking': lgb.LGBMRanker,
'classification': lgb.LGBMClassifier, 'binary-classification': lgb.LGBMClassifier,
'multiclass-classification': lgb.LGBMClassifier,
'regression': lgb.LGBMRegressor, 'regression': lgb.LGBMRegressor,
} }
def _create_data(task, n_samples=100, n_features=4):
    """Create a small synthetic ``(X, y, group)`` dataset for ``task``.

    Parameters
    ----------
    task : str
        One of 'ranking', 'binary-classification',
        'multiclass-classification', or 'regression'.
    n_samples : int, optional (default=100)
        Number of rows in ``X``.
    n_features : int, optional (default=4)
        Number of columns in ``X``.

    Returns
    -------
    X : numpy array of shape = [n_samples, n_features]
    y : numpy array of shape = [n_samples]
    g : numpy array of per-query group sizes ('ranking' task) or None

    Raises
    ------
    ValueError
        If ``task`` is not one of the recognized task names.
    """
    if task == 'ranking':
        # pass n_features through instead of hard-coding 4, so all tasks
        # honor the parameter consistently (default is still 4)
        X, y, g = make_ranking(n_features=n_features, n_samples=n_samples)
        # make_ranking returns per-row query ids; models want group sizes
        g = np.bincount(g)
    elif task.endswith('classification'):
        if task == 'binary-classification':
            centers = 2
        elif task == 'multiclass-classification':
            centers = 3
        else:
            # bug fix: the original built this ValueError but never raised it,
            # so execution fell through with 'centers' unbound (NameError)
            raise ValueError(f"Unknown classification task '{task}'")
        X, y = make_blobs(n_samples=n_samples, n_features=n_features, centers=centers, random_state=42)
        g = None
    elif task == 'regression':
        X, y = make_synthetic_regression(n_samples=n_samples, n_features=n_features)
        g = None
    else:
        # robustness: an unrecognized task previously reached the return
        # statement and raised a confusing NameError on X/y/g
        raise ValueError(f"Unknown task '{task}'")
    return X, y, g
...@@ -1268,7 +1277,7 @@ def test_sklearn_integration(estimator, check): ...@@ -1268,7 +1277,7 @@ def test_sklearn_integration(estimator, check):
check(estimator) check(estimator)
@pytest.mark.parametrize('task', ['classification', 'ranking', 'regression']) @pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification', 'ranking', 'regression'])
def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task): def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task):
pd = pytest.importorskip("pandas") pd = pytest.importorskip("pandas")
X, y, g = _create_data(task) X, y, g = _create_data(task)
...@@ -1378,9 +1387,9 @@ def test_default_n_jobs(tmp_path): ...@@ -1378,9 +1387,9 @@ def test_default_n_jobs(tmp_path):
@pytest.mark.skipif(not PANDAS_INSTALLED, reason='pandas is not installed') @pytest.mark.skipif(not PANDAS_INSTALLED, reason='pandas is not installed')
@pytest.mark.parametrize('task', ['classification', 'ranking', 'regression']) @pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification', 'ranking', 'regression'])
def test_validate_features(task): def test_validate_features(task):
X, y, g = _create_data(task) X, y, g = _create_data(task, n_features=4)
features = ['x1', 'x2', 'x3', 'x4'] features = ['x1', 'x2', 'x3', 'x4']
df = pd_DataFrame(X, columns=features) df = pd_DataFrame(X, columns=features)
model = task_to_model_factory[task](n_estimators=10, num_leaves=15, verbose=-1) model = task_to_model_factory[task](n_estimators=10, num_leaves=15, verbose=-1)
...@@ -1397,3 +1406,94 @@ def test_validate_features(task): ...@@ -1397,3 +1406,94 @@ def test_validate_features(task):
# check that disabling the check doesn't raise the error # check that disabling the check doesn't raise the error
model.predict(df2, validate_features=False) model.predict(df2, validate_features=False)
@pytest.mark.parametrize('X_type', ['dt_DataTable', 'list2d', 'numpy', 'scipy_csc', 'scipy_csr', 'pd_DataFrame'])
@pytest.mark.parametrize('y_type', ['list1d', 'numpy', 'pd_Series', 'pd_DataFrame'])
@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification', 'regression'])
def test_classification_and_regression_minimally_work_with_all_all_accepted_data_types(X_type, y_type, task):
    """Smoke-test fit()/predict() across every accepted X and y container type."""
    requested_types = (X_type, y_type)
    if not PANDAS_INSTALLED and any(t.startswith("pd_") for t in requested_types):
        pytest.skip('pandas is not installed')
    if not DATATABLE_INSTALLED and any(t.startswith("dt_") for t in requested_types):
        pytest.skip('datatable is not installed')

    X, y, _ = _create_data(task, n_samples=1_000)

    # convert X / y into the container under test via dispatch tables;
    # 'numpy' means "leave as-is", anything else unrecognized is an error
    x_converters = {
        'dt_DataTable': dt_DataTable,
        'list2d': lambda arr: arr.tolist(),
        'scipy_csc': scipy.sparse.csc_matrix,
        'scipy_csr': scipy.sparse.csr_matrix,
        'pd_DataFrame': pd_DataFrame,
    }
    if X_type in x_converters:
        X = x_converters[X_type](X)
    elif X_type != 'numpy':
        raise ValueError(f"Unrecognized X_type: '{X_type}'")

    y_converters = {
        'list1d': lambda arr: arr.tolist(),
        'pd_DataFrame': pd_DataFrame,
        'pd_Series': pd_Series,
    }
    if y_type in y_converters:
        y = y_converters[y_type](y)
    elif y_type != 'numpy':
        raise ValueError(f"Unrecognized y_type: '{y_type}'")

    model = task_to_model_factory[task](n_estimators=10, verbose=-1)
    model.fit(X, y)
    preds = model.predict(X)

    # both classification flavors share the same accuracy bar
    if task in ('binary-classification', 'multiclass-classification'):
        assert accuracy_score(y, preds) >= 0.99
    elif task == 'regression':
        assert r2_score(y, preds) > 0.86
    else:
        raise ValueError(f"Unrecognized task: '{task}'")
@pytest.mark.parametrize('X_type', ['dt_DataTable', 'list2d', 'numpy', 'scipy_csc', 'scipy_csr', 'pd_DataFrame'])
@pytest.mark.parametrize('y_type', ['list1d', 'numpy', 'pd_DataFrame', 'pd_Series'])
@pytest.mark.parametrize('g_type', ['list1d_float', 'list1d_int', 'numpy', 'pd_Series'])
def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type, g_type):
    """Smoke-test LGBMRanker fit()/predict() across every accepted X, y, group container type."""
    requested_types = (X_type, y_type, g_type)
    if not PANDAS_INSTALLED and any(t.startswith("pd_") for t in requested_types):
        pytest.skip('pandas is not installed')
    if not DATATABLE_INSTALLED and any(t.startswith("dt_") for t in requested_types):
        pytest.skip('datatable is not installed')

    X, y, g = _create_data(task='ranking', n_samples=1_000)

    # convert each input into the container under test via dispatch tables;
    # 'numpy' means "leave as-is", anything else unrecognized is an error
    x_converters = {
        'dt_DataTable': dt_DataTable,
        'list2d': lambda arr: arr.tolist(),
        'scipy_csc': scipy.sparse.csc_matrix,
        'scipy_csr': scipy.sparse.csr_matrix,
        'pd_DataFrame': pd_DataFrame,
    }
    if X_type in x_converters:
        X = x_converters[X_type](X)
    elif X_type != 'numpy':
        raise ValueError(f"Unrecognized X_type: '{X_type}'")

    y_converters = {
        'list1d': lambda arr: arr.tolist(),
        'pd_DataFrame': pd_DataFrame,
        'pd_Series': pd_Series,
    }
    if y_type in y_converters:
        y = y_converters[y_type](y)
    elif y_type != 'numpy':
        raise ValueError(f"Unrecognized y_type: '{y_type}'")

    g_converters = {
        'list1d_float': lambda arr: arr.astype("float").tolist(),
        'list1d_int': lambda arr: arr.astype("int").tolist(),
        'pd_Series': pd_Series,
    }
    if g_type in g_converters:
        g = g_converters[g_type](g)
    elif g_type != 'numpy':
        raise ValueError(f"Unrecognized g_type: '{g_type}'")

    model = task_to_model_factory['ranking'](n_estimators=10, verbose=-1)
    model.fit(X, y, group=g)
    preds = model.predict(X)
    # rank order of predictions should almost perfectly track the labels
    assert spearmanr(preds, y).correlation >= 0.99
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment