Commit 99daacf1 (unverified), authored by James Lamb, committed by GitHub
Browse files

[python-package] [docs] complete type annotations for scikit-learn fit() methods (#5816)

parent 638014d5
...@@ -10,7 +10,7 @@ import scipy.sparse ...@@ -10,7 +10,7 @@ import scipy.sparse
from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _LGBM_BoosterBestScoreType, from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _LGBM_BoosterBestScoreType,
_LGBM_CategoricalFeatureConfiguration, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration, _LGBM_CategoricalFeatureConfiguration, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration,
_LGBM_GroupType, _LGBM_LabelType, _log_warning) _LGBM_GroupType, _LGBM_InitScoreType, _LGBM_LabelType, _LGBM_WeightType, _log_warning)
from .callback import _EvalResultDict, record_evaluation from .callback import _EvalResultDict, record_evaluation
from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray, from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray,
_LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase, _LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase,
...@@ -83,6 +83,7 @@ _LGBM_ScikitEvalMetricType = Union[ ...@@ -83,6 +83,7 @@ _LGBM_ScikitEvalMetricType = Union[
_LGBM_ScikitCustomEvalFunction, _LGBM_ScikitCustomEvalFunction,
List[Union[str, _LGBM_ScikitCustomEvalFunction]] List[Union[str, _LGBM_ScikitCustomEvalFunction]]
] ]
_LGBM_ScikitValidSet = Tuple[_LGBM_ScikitMatrixLike, _LGBM_LabelType]
class _ObjectiveFunctionWrapper: class _ObjectiveFunctionWrapper:
...@@ -725,15 +726,15 @@ class LGBMModel(_LGBMModelBase): ...@@ -725,15 +726,15 @@ class LGBMModel(_LGBMModelBase):
self, self,
X: _LGBM_ScikitMatrixLike, X: _LGBM_ScikitMatrixLike,
y: _LGBM_LabelType, y: _LGBM_LabelType,
sample_weight=None, sample_weight: Optional[_LGBM_WeightType] = None,
init_score=None, init_score: Optional[_LGBM_InitScoreType] = None,
group: Optional[_LGBM_GroupType] = None, group: Optional[_LGBM_GroupType] = None,
eval_set=None, eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
eval_names: Optional[List[str]] = None, eval_names: Optional[List[str]] = None,
eval_sample_weight=None, eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
eval_class_weight=None, eval_class_weight: Optional[List[float]] = None,
eval_init_score=None, eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
eval_group=None, eval_group: Optional[List[_LGBM_GroupType]] = None,
eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
feature_name: _LGBM_FeatureNameConfiguration = 'auto', feature_name: _LGBM_FeatureNameConfiguration = 'auto',
categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
...@@ -857,12 +858,12 @@ class LGBMModel(_LGBMModelBase): ...@@ -857,12 +858,12 @@ class LGBMModel(_LGBMModelBase):
fit.__doc__ = _lgbmmodel_doc_fit.format( fit.__doc__ = _lgbmmodel_doc_fit.format(
X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]", X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples]", y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples]",
sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)", sample_weight_shape="numpy array, pandas Series, list of int or float of shape = [n_samples] or None, optional (default=None)",
init_score_shape="array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)", init_score_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
group_shape="numpy array, pandas Series, list of int or float, or None, optional (default=None)", group_shape="numpy array, pandas Series, list of int or float, or None, optional (default=None)",
eval_sample_weight_shape="list of array, or None, optional (default=None)", eval_sample_weight_shape="list of array (same types as ``sample_weight`` supports), or None, optional (default=None)",
eval_init_score_shape="list of array, or None, optional (default=None)", eval_init_score_shape="list of array (same types as ``init_score`` supports), or None, optional (default=None)",
eval_group_shape="list of array, or None, optional (default=None)" eval_group_shape="list of array (same types as ``group`` supports), or None, optional (default=None)"
) + "\n\n" + _lgbmmodel_doc_custom_eval_note ) + "\n\n" + _lgbmmodel_doc_custom_eval_note
def predict( def predict(
...@@ -1021,12 +1022,12 @@ class LGBMRegressor(_LGBMRegressorBase, LGBMModel): ...@@ -1021,12 +1022,12 @@ class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
self, self,
X: _LGBM_ScikitMatrixLike, X: _LGBM_ScikitMatrixLike,
y: _LGBM_LabelType, y: _LGBM_LabelType,
sample_weight=None, sample_weight: Optional[_LGBM_WeightType] = None,
init_score=None, init_score: Optional[_LGBM_InitScoreType] = None,
eval_set=None, eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
eval_names: Optional[List[str]] = None, eval_names: Optional[List[str]] = None,
eval_sample_weight=None, eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
eval_init_score=None, eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
feature_name: _LGBM_FeatureNameConfiguration = 'auto', feature_name: _LGBM_FeatureNameConfiguration = 'auto',
categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
...@@ -1067,13 +1068,13 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel): ...@@ -1067,13 +1068,13 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
self, self,
X: _LGBM_ScikitMatrixLike, X: _LGBM_ScikitMatrixLike,
y: _LGBM_LabelType, y: _LGBM_LabelType,
sample_weight=None, sample_weight: Optional[_LGBM_WeightType] = None,
init_score=None, init_score: Optional[_LGBM_InitScoreType] = None,
eval_set=None, eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
eval_names: Optional[List[str]] = None, eval_names: Optional[List[str]] = None,
eval_sample_weight=None, eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
eval_class_weight=None, eval_class_weight: Optional[List[float]] = None,
eval_init_score=None, eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
feature_name: _LGBM_FeatureNameConfiguration = 'auto', feature_name: _LGBM_FeatureNameConfiguration = 'auto',
categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
...@@ -1116,7 +1117,7 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel): ...@@ -1116,7 +1117,7 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
eval_metric = eval_metric_list eval_metric = eval_metric_list
# do not modify args, as it causes errors in model selection tools # do not modify args, as it causes errors in model selection tools
valid_sets: Optional[List[Tuple]] = None valid_sets: Optional[List[_LGBM_ScikitValidSet]] = None
if eval_set is not None: if eval_set is not None:
if isinstance(eval_set, tuple): if isinstance(eval_set, tuple):
eval_set = [eval_set] eval_set = [eval_set]
...@@ -1251,14 +1252,14 @@ class LGBMRanker(LGBMModel): ...@@ -1251,14 +1252,14 @@ class LGBMRanker(LGBMModel):
self, self,
X: _LGBM_ScikitMatrixLike, X: _LGBM_ScikitMatrixLike,
y: _LGBM_LabelType, y: _LGBM_LabelType,
sample_weight=None, sample_weight: Optional[_LGBM_WeightType] = None,
init_score=None, init_score: Optional[_LGBM_InitScoreType] = None,
group: Optional[_LGBM_GroupType] = None, group: Optional[_LGBM_GroupType] = None,
eval_set=None, eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
eval_names: Optional[List[str]] = None, eval_names: Optional[List[str]] = None,
eval_sample_weight=None, eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
eval_init_score=None, eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
eval_group=None, eval_group: Optional[List[_LGBM_GroupType]] = None,
eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
eval_at: Union[List[int], Tuple[int, ...]] = (1, 2, 3, 4, 5), eval_at: Union[List[int], Tuple[int, ...]] = (1, 2, 3, 4, 5),
feature_name: _LGBM_FeatureNameConfiguration = 'auto', feature_name: _LGBM_FeatureNameConfiguration = 'auto',
......
...@@ -1416,7 +1416,17 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data ...@@ -1416,7 +1416,17 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
pytest.skip('pandas is not installed') pytest.skip('pandas is not installed')
if any(t.startswith("dt_") for t in [X_type, y_type]) and not DATATABLE_INSTALLED: if any(t.startswith("dt_") for t in [X_type, y_type]) and not DATATABLE_INSTALLED:
pytest.skip('datatable is not installed') pytest.skip('datatable is not installed')
X, y, g = _create_data(task, n_samples=1_000) X, y, g = _create_data(task, n_samples=2_000)
weights = np.abs(np.random.randn(y.shape[0]))
if task == 'binary-classification' or task == 'regression':
init_score = np.full_like(y, np.mean(y))
elif task == 'multiclass-classification':
init_score = np.outer(y, np.array([0.1, 0.2, 0.7]))
else:
raise ValueError(f"Unrecognized task '{task}'")
X_valid = X * 2
if X_type == 'dt_DataTable': if X_type == 'dt_DataTable':
X = dt_DataTable(X) X = dt_DataTable(X)
elif X_type == 'list2d': elif X_type == 'list2d':
...@@ -1430,17 +1440,39 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data ...@@ -1430,17 +1440,39 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
elif X_type != 'numpy': elif X_type != 'numpy':
raise ValueError(f"Unrecognized X_type: '{X_type}'") raise ValueError(f"Unrecognized X_type: '{X_type}'")
# make weights and init_score same types as y, just to avoid
# a huge number of combinations and therefore test cases
if y_type == 'list1d': if y_type == 'list1d':
y = y.tolist() y = y.tolist()
weights = weights.tolist()
init_score = init_score.tolist()
elif y_type == 'pd_DataFrame': elif y_type == 'pd_DataFrame':
y = pd_DataFrame(y) y = pd_DataFrame(y)
weights = pd_Series(weights)
if task == 'multiclass-classification':
init_score = pd_DataFrame(init_score)
else:
init_score = pd_Series(init_score)
elif y_type == 'pd_Series': elif y_type == 'pd_Series':
y = pd_Series(y) y = pd_Series(y)
weights = pd_Series(weights)
if task == 'multiclass-classification':
init_score = pd_DataFrame(init_score)
else:
init_score = pd_Series(init_score)
elif y_type != 'numpy': elif y_type != 'numpy':
raise ValueError(f"Unrecognized y_type: '{y_type}'") raise ValueError(f"Unrecognized y_type: '{y_type}'")
model = task_to_model_factory[task](n_estimators=10, verbose=-1) model = task_to_model_factory[task](n_estimators=10, verbose=-1)
model.fit(X, y) model.fit(
X=X,
y=y,
sample_weight=weights,
init_score=init_score,
eval_set=[(X_valid, y)],
eval_sample_weight=[weights],
eval_init_score=[init_score]
)
preds = model.predict(X) preds = model.predict(X)
if task == 'binary-classification': if task == 'binary-classification':
...@@ -1462,6 +1494,10 @@ def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type ...@@ -1462,6 +1494,10 @@ def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type
if any(t.startswith("dt_") for t in [X_type, y_type, g_type]) and not DATATABLE_INSTALLED: if any(t.startswith("dt_") for t in [X_type, y_type, g_type]) and not DATATABLE_INSTALLED:
pytest.skip('datatable is not installed') pytest.skip('datatable is not installed')
X, y, g = _create_data(task='ranking', n_samples=1_000) X, y, g = _create_data(task='ranking', n_samples=1_000)
weights = np.abs(np.random.randn(y.shape[0]))
init_score = np.full_like(y, np.mean(y))
X_valid = X * 2
if X_type == 'dt_DataTable': if X_type == 'dt_DataTable':
X = dt_DataTable(X) X = dt_DataTable(X)
elif X_type == 'list2d': elif X_type == 'list2d':
...@@ -1475,12 +1511,20 @@ def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type ...@@ -1475,12 +1511,20 @@ def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type
elif X_type != 'numpy': elif X_type != 'numpy':
raise ValueError(f"Unrecognized X_type: '{X_type}'") raise ValueError(f"Unrecognized X_type: '{X_type}'")
# make weights and init_score same types as y, just to avoid
# a huge number of combinations and therefore test cases
if y_type == 'list1d': if y_type == 'list1d':
y = y.tolist() y = y.tolist()
weights = weights.tolist()
init_score = init_score.tolist()
elif y_type == 'pd_DataFrame': elif y_type == 'pd_DataFrame':
y = pd_DataFrame(y) y = pd_DataFrame(y)
weights = pd_Series(weights)
init_score = pd_Series(init_score)
elif y_type == 'pd_Series': elif y_type == 'pd_Series':
y = pd_Series(y) y = pd_Series(y)
weights = pd_Series(weights)
init_score = pd_Series(init_score)
elif y_type != 'numpy': elif y_type != 'numpy':
raise ValueError(f"Unrecognized y_type: '{y_type}'") raise ValueError(f"Unrecognized y_type: '{y_type}'")
...@@ -1494,6 +1538,16 @@ def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type ...@@ -1494,6 +1538,16 @@ def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type
raise ValueError(f"Unrecognized g_type: '{g_type}'") raise ValueError(f"Unrecognized g_type: '{g_type}'")
model = task_to_model_factory['ranking'](n_estimators=10, verbose=-1) model = task_to_model_factory['ranking'](n_estimators=10, verbose=-1)
model.fit(X, y, group=g) model.fit(
X=X,
y=y,
sample_weight=weights,
init_score=init_score,
group=g,
eval_set=[(X_valid, y)],
eval_sample_weight=[weights],
eval_init_score=[init_score],
eval_group=[g]
)
preds = model.predict(X) preds = model.predict(X)
assert spearmanr(preds, y).correlation >= 0.99 assert spearmanr(preds, y).correlation >= 0.99
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment