Unverified Commit 7eae66a7 authored by vnherdeiro's avatar vnherdeiro Committed by GitHub
Browse files

[python-package] require `scikit-learn>=0.24.2`, make scikit-learn estimators...


[python-package] require `scikit-learn>=0.24.2`, make scikit-learn estimators compatible with `scikit-learn>=1.6.0dev` (#6651)
Co-authored-by: default avatarJames Lamb <jaylamb20@gmail.com>
Co-authored-by: default avatarNikita Titov <nekit94-08@mail.ru>
parent 06432300
...@@ -22,7 +22,7 @@ python -m pip install \ ...@@ -22,7 +22,7 @@ python -m pip install \
'numpy>=2.0.0.dev0' \ 'numpy>=2.0.0.dev0' \
'matplotlib>=3.10.0.dev0' \ 'matplotlib>=3.10.0.dev0' \
'pandas>=3.0.0.dev0' \ 'pandas>=3.0.0.dev0' \
'scikit-learn==1.5.*' \ 'scikit-learn>=1.6.dev0' \
'scipy>=1.15.0.dev0' 'scipy>=1.15.0.dev0'
python -m pip install \ python -m pip install \
......
...@@ -15,7 +15,7 @@ pip install \ ...@@ -15,7 +15,7 @@ pip install \
'numpy==1.19.0' \ 'numpy==1.19.0' \
'pandas==1.1.3' \ 'pandas==1.1.3' \
'pyarrow==6.0.1' \ 'pyarrow==6.0.1' \
'scikit-learn==0.24.0' \ 'scikit-learn==0.24.2' \
'scipy==1.6.0' \ 'scipy==1.6.0' \
|| exit 1 || exit 1
echo "done installing lightgbm's dependencies" echo "done installing lightgbm's dependencies"
......
...@@ -103,6 +103,7 @@ if [[ $TASK == "lint" ]]; then ...@@ -103,6 +103,7 @@ if [[ $TASK == "lint" ]]; then
'mypy>=1.11.1' \ 'mypy>=1.11.1' \
'pre-commit>=3.8.0' \ 'pre-commit>=3.8.0' \
'pyarrow-core>=17.0' \ 'pyarrow-core>=17.0' \
'scikit-learn>=1.5.2' \
'r-lintr>=3.1.2' 'r-lintr>=3.1.2'
source activate $CONDA_ENV source activate $CONDA_ENV
echo "Linting Python code" echo "Linting Python code"
......
# coding: utf-8 # coding: utf-8
"""Compatibility library.""" """Compatibility library."""
from typing import Any, List from typing import TYPE_CHECKING, Any, List
# scikit-learn is intentionally imported first here, # scikit-learn is intentionally imported first here,
# see https://github.com/microsoft/LightGBM/issues/6509 # see https://github.com/microsoft/LightGBM/issues/6509
"""sklearn""" """sklearn"""
try: try:
from sklearn import __version__ as _sklearn_version
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_sample_weight from sklearn.utils.class_weight import compute_sample_weight
...@@ -29,6 +30,74 @@ try: ...@@ -29,6 +30,74 @@ try:
check_consistent_length(sample_weight, X) check_consistent_length(sample_weight, X)
return sample_weight return sample_weight
# scikit-learn>=1.6 ships validate_data() as a public helper; use it directly when present.
try:
    from sklearn.utils.validation import validate_data
except ImportError:
    # validate_data() was added in scikit-learn 1.6, this function roughly imitates it for older versions.
    # It can be removed when lightgbm's minimum scikit-learn version is at least 1.6.
    def validate_data(
        _estimator,
        X,
        y="no_validation",
        accept_sparse: bool = True,
        # 'force_all_finite' was renamed to 'ensure_all_finite' in scikit-learn 1.6
        ensure_all_finite: bool = False,
        ensure_min_samples: int = 1,
        # trap other keyword arguments that only work on scikit-learn >=1.6, like 'reset'
        **ignored_kwargs,
    ):
        """Validate ``X`` (and optionally ``y``) and update ``_estimator.n_features_in_``.

        Rough stand-in for ``sklearn.utils.validation.validate_data()`` on scikit-learn<1.6.
        Returns the validated ``X`` when ``y`` is the sentinel ``"no_validation"``, otherwise
        the validated ``(X, y)`` pair. Raises ``ValueError`` when the estimator reports itself
        fitted and ``X`` has a different number of features than ``_estimator._n_features``.
        """
        # it's safe to import _num_features unconditionally because:
        #
        # * it was first added in scikit-learn 0.24.2
        # * lightgbm cannot be used with scikit-learn versions older than that
        # * this validate_data() re-implementation will not be called in scikit-learn>=1.6
        #
        from sklearn.utils.validation import _num_features

        # _num_features() raises a TypeError on 1-dimensional input. That's a problem
        # because scikit-learn's 'check_fit1d' estimator check sets the expectation that
        # estimators must raise a ValueError when a 1-dimensional input is passed to fit().
        #
        # So here, lightgbm avoids calling _num_features() on 1-dimensional inputs.
        if hasattr(X, "shape") and len(X.shape) == 1:
            n_features_in_ = 1
        else:
            n_features_in_ = _num_features(X)

        no_val_y = isinstance(y, str) and y == "no_validation"

        # NOTE: check_X_y() calls check_array() internally, so only need to call one or the other of them here
        if no_val_y:
            X = check_array(
                X,
                accept_sparse=accept_sparse,
                force_all_finite=ensure_all_finite,
                ensure_min_samples=ensure_min_samples,
            )
        else:
            X, y = check_X_y(
                X,
                y,
                accept_sparse=accept_sparse,
                force_all_finite=ensure_all_finite,
                ensure_min_samples=ensure_min_samples,
            )

        # this only needs to be updated at fit() time
        _estimator.n_features_in_ = n_features_in_

        # raise the same error that scikit-learn's `validate_data()` does on scikit-learn>=1.6
        if _estimator.__sklearn_is_fitted__() and _estimator._n_features != n_features_in_:
            raise ValueError(
                f"X has {n_features_in_} features, but {_estimator.__class__.__name__} "
                f"is expecting {_estimator._n_features} features as input."
            )

        if no_val_y:
            return X
        else:
            return X, y
SKLEARN_INSTALLED = True SKLEARN_INSTALLED = True
_LGBMBaseCrossValidator = BaseCrossValidator _LGBMBaseCrossValidator = BaseCrossValidator
_LGBMModelBase = BaseEstimator _LGBMModelBase = BaseEstimator
...@@ -38,12 +107,11 @@ try: ...@@ -38,12 +107,11 @@ try:
LGBMNotFittedError = NotFittedError LGBMNotFittedError = NotFittedError
_LGBMStratifiedKFold = StratifiedKFold _LGBMStratifiedKFold = StratifiedKFold
_LGBMGroupKFold = GroupKFold _LGBMGroupKFold = GroupKFold
_LGBMCheckXY = check_X_y
_LGBMCheckArray = check_array
_LGBMCheckSampleWeight = _check_sample_weight _LGBMCheckSampleWeight = _check_sample_weight
_LGBMAssertAllFinite = assert_all_finite _LGBMAssertAllFinite = assert_all_finite
_LGBMCheckClassificationTargets = check_classification_targets _LGBMCheckClassificationTargets = check_classification_targets
_LGBMComputeSampleWeight = compute_sample_weight _LGBMComputeSampleWeight = compute_sample_weight
_LGBMValidateData = validate_data
except ImportError: except ImportError:
SKLEARN_INSTALLED = False SKLEARN_INSTALLED = False
...@@ -67,12 +135,22 @@ except ImportError: ...@@ -67,12 +135,22 @@ except ImportError:
LGBMNotFittedError = ValueError LGBMNotFittedError = ValueError
_LGBMStratifiedKFold = None _LGBMStratifiedKFold = None
_LGBMGroupKFold = None _LGBMGroupKFold = None
_LGBMCheckXY = None
_LGBMCheckArray = None
_LGBMCheckSampleWeight = None _LGBMCheckSampleWeight = None
_LGBMAssertAllFinite = None _LGBMAssertAllFinite = None
_LGBMCheckClassificationTargets = None _LGBMCheckClassificationTargets = None
_LGBMComputeSampleWeight = None _LGBMComputeSampleWeight = None
_LGBMValidateData = None
_sklearn_version = None
# additional scikit-learn imports only for type hints
if TYPE_CHECKING:
# sklearn.utils.Tags can be imported unconditionally once
# lightgbm's minimum scikit-learn version is 1.6 or higher
try:
from sklearn.utils import Tags as _sklearn_Tags
except ImportError:
_sklearn_Tags = None
"""pandas""" """pandas"""
try: try:
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
import copy import copy
from inspect import signature from inspect import signature
from pathlib import Path from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Union from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np import numpy as np
import scipy.sparse import scipy.sparse
...@@ -31,21 +31,25 @@ from .compat import ( ...@@ -31,21 +31,25 @@ from .compat import (
SKLEARN_INSTALLED, SKLEARN_INSTALLED,
LGBMNotFittedError, LGBMNotFittedError,
_LGBMAssertAllFinite, _LGBMAssertAllFinite,
_LGBMCheckArray,
_LGBMCheckClassificationTargets, _LGBMCheckClassificationTargets,
_LGBMCheckSampleWeight, _LGBMCheckSampleWeight,
_LGBMCheckXY,
_LGBMClassifierBase, _LGBMClassifierBase,
_LGBMComputeSampleWeight, _LGBMComputeSampleWeight,
_LGBMCpuCount, _LGBMCpuCount,
_LGBMLabelEncoder, _LGBMLabelEncoder,
_LGBMModelBase, _LGBMModelBase,
_LGBMRegressorBase, _LGBMRegressorBase,
_LGBMValidateData,
_sklearn_version,
dt_DataTable, dt_DataTable,
pd_DataFrame, pd_DataFrame,
) )
from .engine import train from .engine import train
if TYPE_CHECKING:
from .compat import _sklearn_Tags
__all__ = [ __all__ = [
"LGBMClassifier", "LGBMClassifier",
"LGBMModel", "LGBMModel",
...@@ -662,6 +666,10 @@ class LGBMModel(_LGBMModelBase): ...@@ -662,6 +666,10 @@ class LGBMModel(_LGBMModelBase):
self._n_classes: int = -1 self._n_classes: int = -1
self.set_params(**kwargs) self.set_params(**kwargs)
# scikit-learn 1.6 introduced an __sklearn_tags__() method intended to replace _more_tags().
# _more_tags() can be removed whenever lightgbm's minimum supported scikit-learn version
# is >=1.6.
# ref: https://github.com/microsoft/LightGBM/pull/6651
def _more_tags(self) -> Dict[str, Any]: def _more_tags(self) -> Dict[str, Any]:
return { return {
"allow_nan": True, "allow_nan": True,
...@@ -669,10 +677,46 @@ class LGBMModel(_LGBMModelBase): ...@@ -669,10 +677,46 @@ class LGBMModel(_LGBMModelBase):
"_xfail_checks": { "_xfail_checks": {
"check_no_attributes_set_in_init": "scikit-learn incorrectly asserts that private attributes " "check_no_attributes_set_in_init": "scikit-learn incorrectly asserts that private attributes "
"cannot be set in __init__: " "cannot be set in __init__: "
"(see https://github.com/microsoft/LightGBM/issues/2628)" "(see https://github.com/microsoft/LightGBM/issues/2628)",
}, },
} }
@staticmethod
def _update_sklearn_tags_from_dict(
*,
tags: "_sklearn_Tags",
tags_dict: Dict[str, Any],
) -> "_sklearn_Tags":
"""Update ``sklearn.utils.Tags`` inherited from ``scikit-learn`` base classes.
``scikit-learn`` 1.6 introduced a dataclass-based interface for estimator tags.
ref: https://github.com/scikit-learn/scikit-learn/pull/29677
This method handles updating that instance based on the value in ``self._more_tags()``.
"""
tags.input_tags.allow_nan = tags_dict["allow_nan"]
tags.input_tags.sparse = "sparse" in tags_dict["X_types"]
tags.target_tags.one_d_labels = "1dlabels" in tags_dict["X_types"]
tags._xfail_checks = tags_dict["_xfail_checks"]
return tags
def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]:
# _LGBMModelBase.__sklearn_tags__() cannot be called unconditionally,
# because that method isn't defined for scikit-learn<1.6
if not hasattr(_LGBMModelBase, "__sklearn_tags__"):
err_msg = (
"__sklearn_tags__() should not be called when using scikit-learn<1.6. "
f"Detected version: {_sklearn_version}"
)
raise AttributeError(err_msg)
# take whatever tags are provided by BaseEstimator, then modify
# them with LightGBM-specific values
return self._update_sklearn_tags_from_dict(
tags=_LGBMModelBase.__sklearn_tags__(self),
tags_dict=self._more_tags(),
)
def __sklearn_is_fitted__(self) -> bool: def __sklearn_is_fitted__(self) -> bool:
return getattr(self, "fitted_", False) return getattr(self, "fitted_", False)
...@@ -862,12 +906,26 @@ class LGBMModel(_LGBMModelBase): ...@@ -862,12 +906,26 @@ class LGBMModel(_LGBMModelBase):
params["metric"] = [metric for metric in params["metric"] if metric is not None] params["metric"] = [metric for metric in params["metric"] if metric is not None]
if not isinstance(X, (pd_DataFrame, dt_DataTable)): if not isinstance(X, (pd_DataFrame, dt_DataTable)):
_X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2) _X, _y = _LGBMValidateData(
self,
X,
y,
reset=True,
# allow any input type (this validation is done further down, in lgb.Dataset())
accept_sparse=True,
# do not raise an error if Inf or NaN values are found (LightGBM handles these internally)
ensure_all_finite=False,
# raise an error on 0-row and 1-row inputs
ensure_min_samples=2,
)
if sample_weight is not None: if sample_weight is not None:
sample_weight = _LGBMCheckSampleWeight(sample_weight, _X) sample_weight = _LGBMCheckSampleWeight(sample_weight, _X)
else: else:
_X, _y = X, y _X, _y = X, y
# for other data types, setting n_features_in_ is handled by _LGBMValidateData() in the branch above
self.n_features_in_ = _X.shape[1]
if self._class_weight is None: if self._class_weight is None:
self._class_weight = self.class_weight self._class_weight = self.class_weight
if self._class_weight is not None: if self._class_weight is not None:
...@@ -877,10 +935,6 @@ class LGBMModel(_LGBMModelBase): ...@@ -877,10 +935,6 @@ class LGBMModel(_LGBMModelBase):
else: else:
sample_weight = np.multiply(sample_weight, class_sample_weight) sample_weight = np.multiply(sample_weight, class_sample_weight)
self._n_features = _X.shape[1]
# copy for consistency
self._n_features_in = self._n_features
train_set = Dataset( train_set = Dataset(
data=_X, data=_X,
label=_y, label=_y,
...@@ -963,6 +1017,13 @@ class LGBMModel(_LGBMModelBase): ...@@ -963,6 +1017,13 @@ class LGBMModel(_LGBMModelBase):
callbacks=callbacks, callbacks=callbacks,
) )
# This populates the property self.n_features_, the number of features in the fitted model,
# and so should only be set after fitting.
#
# The related property self._n_features_in, which populates self.n_features_in_,
# is set BEFORE fitting.
self._n_features = self._Booster.num_feature()
self._evals_result = evals_result self._evals_result = evals_result
self._best_iteration = self._Booster.best_iteration self._best_iteration = self._Booster.best_iteration
self._best_score = self._Booster.best_score self._best_score = self._Booster.best_score
...@@ -1004,13 +1065,20 @@ class LGBMModel(_LGBMModelBase): ...@@ -1004,13 +1065,20 @@ class LGBMModel(_LGBMModelBase):
if not self.__sklearn_is_fitted__(): if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.") raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.")
if not isinstance(X, (pd_DataFrame, dt_DataTable)): if not isinstance(X, (pd_DataFrame, dt_DataTable)):
X = _LGBMCheckArray(X, accept_sparse=True, force_all_finite=False) X = _LGBMValidateData(
n_features = X.shape[1] self,
if self._n_features != n_features: X,
raise ValueError( # 'y' being omitted = run scikit-learn's check_array() instead of check_X_y()
"Number of features of the model must " #
f"match the input. Model n_features_ is {self._n_features} and " # Prevent scikit-learn from deleting or modifying attributes like 'feature_names_in_' and 'n_features_in_'.
f"input n_features is {n_features}" # These shouldn't be changed at predict() time.
reset=False,
# allow any input type (this validation is done further down, in lgb.Dataset())
accept_sparse=True,
# do not raise an error if Inf or NaN values are found (LightGBM handles these internally)
ensure_all_finite=False,
# raise an error on 0-row inputs
ensure_min_samples=1,
) )
# retrieve original params that possibly can be used in both training and prediction # retrieve original params that possibly can be used in both training and prediction
# and then overwrite them (considering aliases) with params that were passed directly in prediction # and then overwrite them (considering aliases) with params that were passed directly in prediction
...@@ -1067,6 +1135,21 @@ class LGBMModel(_LGBMModelBase): ...@@ -1067,6 +1135,21 @@ class LGBMModel(_LGBMModelBase):
raise LGBMNotFittedError("No n_features_in found. Need to call fit beforehand.") raise LGBMNotFittedError("No n_features_in found. Need to call fit beforehand.")
return self._n_features_in return self._n_features_in
    @n_features_in_.setter
    def n_features_in_(self, value: int) -> None:
        """Set number of features found in passed-in dataset.

        Starting with ``scikit-learn`` 1.6, ``scikit-learn`` expects to be able to directly
        set this property in functions like ``validate_data()``.

        .. note::

            Do not call ``estimator.n_features_in_ = some_int`` or anything else that invokes
            this method. It is only here for compatibility with ``scikit-learn`` validation
            functions used internally in ``lightgbm``.
        """
        # backing attribute that the n_features_in_ property getter reads back out
        self._n_features_in = value
@property @property
def best_score_(self) -> _LGBM_BoosterBestScoreType: def best_score_(self) -> _LGBM_BoosterBestScoreType:
""":obj:`dict`: The best score of fitted model.""" """:obj:`dict`: The best score of fitted model."""
...@@ -1165,10 +1248,45 @@ class LGBMModel(_LGBMModelBase): ...@@ -1165,10 +1248,45 @@ class LGBMModel(_LGBMModelBase):
raise LGBMNotFittedError("No feature_names_in_ found. Need to call fit beforehand.") raise LGBMNotFittedError("No feature_names_in_ found. Need to call fit beforehand.")
return np.array(self.feature_name_) return np.array(self.feature_name_)
    @feature_names_in_.deleter
    def feature_names_in_(self) -> None:
        """Intercept calls to delete ``feature_names_in_``.

        Some code paths in ``scikit-learn`` try to delete the ``feature_names_in_`` attribute
        on estimators when a new training dataset that doesn't have features is passed.
        LightGBM automatically assigns feature names to such datasets
        (like ``Column_0``, ``Column_1``, etc.) and so does not want that behavior.

        However, that behavior is coupled to ``scikit-learn`` automatically updating
        ``n_features_in_`` in those same code paths, which is necessary for compliance
        with its API (via argument ``reset`` to functions like ``validate_data()`` and
        ``check_array()``).

        .. note::

            Do not call ``del estimator.feature_names_in_`` or anything else that invokes
            this method. It is only here for compatibility with ``scikit-learn`` validation
            functions used internally in ``lightgbm``.
        """
        # intentionally a no-op: the deletion is silently swallowed so LightGBM's
        # auto-generated feature names survive scikit-learn's reset logic
        pass
class LGBMRegressor(_LGBMRegressorBase, LGBMModel): class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
"""LightGBM regressor.""" """LightGBM regressor."""
def _more_tags(self) -> Dict[str, Any]:
# handle the case where RegressorMixin possibly provides _more_tags()
if callable(getattr(_LGBMRegressorBase, "_more_tags", None)):
tags = _LGBMRegressorBase._more_tags(self)
else:
tags = {}
# override those with LightGBM-specific preferences
tags.update(LGBMModel._more_tags(self))
return tags
    def __sklearn_tags__(self) -> "_sklearn_Tags":
        # explicitly use LGBMModel's implementation (not the scikit-learn mixin's),
        # so LightGBM-specific tag values are applied
        return LGBMModel.__sklearn_tags__(self)
def fit( # type: ignore[override] def fit( # type: ignore[override]
self, self,
X: _LGBM_ScikitMatrixLike, X: _LGBM_ScikitMatrixLike,
...@@ -1215,6 +1333,19 @@ class LGBMRegressor(_LGBMRegressorBase, LGBMModel): ...@@ -1215,6 +1333,19 @@ class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
class LGBMClassifier(_LGBMClassifierBase, LGBMModel): class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
"""LightGBM classifier.""" """LightGBM classifier."""
def _more_tags(self) -> Dict[str, Any]:
# handle the case where ClassifierMixin possibly provides _more_tags()
if callable(getattr(_LGBMClassifierBase, "_more_tags", None)):
tags = _LGBMClassifierBase._more_tags(self)
else:
tags = {}
# override those with LightGBM-specific preferences
tags.update(LGBMModel._more_tags(self))
return tags
    def __sklearn_tags__(self) -> "_sklearn_Tags":
        # explicitly use LGBMModel's implementation (not the scikit-learn mixin's),
        # so LightGBM-specific tag values are applied
        return LGBMModel.__sklearn_tags__(self)
def fit( # type: ignore[override] def fit( # type: ignore[override]
self, self,
X: _LGBM_ScikitMatrixLike, X: _LGBM_ScikitMatrixLike,
......
...@@ -45,7 +45,7 @@ pandas = [ ...@@ -45,7 +45,7 @@ pandas = [
"pandas>=0.24.0" "pandas>=0.24.0"
] ]
scikit-learn = [ scikit-learn = [
"scikit-learn!=0.22.0" "scikit-learn>=0.24.2"
] ]
[project.urls] [project.urls]
......
...@@ -36,12 +36,14 @@ from .utils import ( ...@@ -36,12 +36,14 @@ from .utils import (
) )
decreasing_generator = itertools.count(0, -1) decreasing_generator = itertools.count(0, -1)
estimator_classes = (lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker)
task_to_model_factory = { task_to_model_factory = {
"ranking": lgb.LGBMRanker, "ranking": lgb.LGBMRanker,
"binary-classification": lgb.LGBMClassifier, "binary-classification": lgb.LGBMClassifier,
"multiclass-classification": lgb.LGBMClassifier, "multiclass-classification": lgb.LGBMClassifier,
"regression": lgb.LGBMRegressor, "regression": lgb.LGBMRegressor,
} }
all_tasks = tuple(task_to_model_factory.keys())
def _create_data(task, n_samples=100, n_features=4): def _create_data(task, n_samples=100, n_features=4):
...@@ -1311,7 +1313,7 @@ def test_check_is_fitted(): ...@@ -1311,7 +1313,7 @@ def test_check_is_fitted():
check_is_fitted(model) check_is_fitted(model)
@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker]) @pytest.mark.parametrize("estimator_class", estimator_classes)
@pytest.mark.parametrize("max_depth", [3, 4, 5, 8]) @pytest.mark.parametrize("max_depth", [3, 4, 5, 8])
def test_max_depth_warning_is_never_raised(capsys, estimator_class, max_depth): def test_max_depth_warning_is_never_raised(capsys, estimator_class, max_depth):
X, y = make_blobs(n_samples=1_000, n_features=1, centers=2) X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
...@@ -1390,7 +1392,7 @@ def test_fit_only_raises_num_rounds_warning_when_expected(capsys): ...@@ -1390,7 +1392,7 @@ def test_fit_only_raises_num_rounds_warning_when_expected(capsys):
assert_silent(capsys) assert_silent(capsys)
@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker]) @pytest.mark.parametrize("estimator_class", estimator_classes)
def test_getting_feature_names_in_np_input(estimator_class): def test_getting_feature_names_in_np_input(estimator_class):
# input is a numpy array, which doesn't have feature names. LightGBM adds # input is a numpy array, which doesn't have feature names. LightGBM adds
# feature names to the fitted model, which is inconsistent with sklearn's behavior # feature names to the fitted model, which is inconsistent with sklearn's behavior
...@@ -1409,7 +1411,7 @@ def test_getting_feature_names_in_np_input(estimator_class): ...@@ -1409,7 +1411,7 @@ def test_getting_feature_names_in_np_input(estimator_class):
np.testing.assert_array_equal(model.feature_names_in_, np.array([f"Column_{i}" for i in range(X.shape[1])])) np.testing.assert_array_equal(model.feature_names_in_, np.array([f"Column_{i}" for i in range(X.shape[1])]))
@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker]) @pytest.mark.parametrize("estimator_class", estimator_classes)
def test_getting_feature_names_in_pd_input(estimator_class): def test_getting_feature_names_in_pd_input(estimator_class):
X, y = load_digits(n_class=2, return_X_y=True, as_frame=True) X, y = load_digits(n_class=2, return_X_y=True, as_frame=True)
col_names = X.columns.to_list() col_names = X.columns.to_list()
...@@ -1436,7 +1438,29 @@ def test_sklearn_integration(estimator, check): ...@@ -1436,7 +1438,29 @@ def test_sklearn_integration(estimator, check):
check(estimator) check(estimator)
@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "ranking", "regression"]) @pytest.mark.parametrize("estimator_class", estimator_classes)
def test_sklearn_tags_should_correctly_reflect_lightgbm_specific_values(estimator_class):
    """Old-style _more_tags() and new-style __sklearn_tags__() must agree on LightGBM's tag values."""
    est = estimator_class()
    more_tags = est._more_tags()
    err_msg = "List of supported X_types has changed. Update LGBMModel.__sklearn_tags__() to match."
    assert more_tags["X_types"] == ["2darray", "sparse", "1dlabels"], err_msg
    # the try-except part of this should be removed once lightgbm's
    # minimum supported scikit-learn version is at least 1.6
    try:
        sklearn_tags = est.__sklearn_tags__()
    except AttributeError as err:
        # only the exact error we expected to be raised should be raised
        # NOTE(review): relies on `re` being imported at this test module's top — confirm
        assert bool(re.search(r"__sklearn_tags__.* should not be called", str(err)))
    else:
        # if no AttributeError was thrown, we must be using scikit-learn>=1.6,
        # and so the actual effects of __sklearn_tags__() should be tested
        assert sklearn_tags.input_tags.allow_nan is True
        assert sklearn_tags.input_tags.sparse is True
        assert sklearn_tags.target_tags.one_d_labels is True
        assert sklearn_tags._xfail_checks == more_tags["_xfail_checks"]
@pytest.mark.parametrize("task", all_tasks)
def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task): def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task):
pd = pytest.importorskip("pandas") pd = pytest.importorskip("pandas")
X, y, g = _create_data(task) X, y, g = _create_data(task)
...@@ -1540,7 +1564,7 @@ def test_default_n_jobs(tmp_path): ...@@ -1540,7 +1564,7 @@ def test_default_n_jobs(tmp_path):
@pytest.mark.skipif(not PANDAS_INSTALLED, reason="pandas is not installed") @pytest.mark.skipif(not PANDAS_INSTALLED, reason="pandas is not installed")
@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "ranking", "regression"]) @pytest.mark.parametrize("task", all_tasks)
def test_validate_features(task): def test_validate_features(task):
X, y, g = _create_data(task, n_features=4) X, y, g = _create_data(task, n_features=4)
features = ["x1", "x2", "x3", "x4"] features = ["x1", "x2", "x3", "x4"]
...@@ -1561,6 +1585,52 @@ def test_validate_features(task): ...@@ -1561,6 +1585,52 @@ def test_validate_features(task):
model.predict(df2, validate_features=False) model.predict(df2, validate_features=False)
# LightGBM's 'predict_disable_shape_check' mechanism is intentionally not respected by
# its scikit-learn estimators, for consistency with scikit-learn's own behavior.
@pytest.mark.parametrize("task", all_tasks)
@pytest.mark.parametrize("predict_disable_shape_check", [True, False])
def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disable_shape_check, task):
    """predict() and predict_proba() must raise when X's column count differs from training."""
    X, y, g = _create_data(task, n_features=4)
    model_factory = task_to_model_factory[task]
    fit_kwargs = {"X": X[:, :-1], "y": y}
    if task == "ranking":
        estimator_name = "LGBMRanker"
        fit_kwargs.update({"group": g})
    elif task == "regression":
        estimator_name = "LGBMRegressor"
    else:
        estimator_name = "LGBMClassifier"

    # train on the first 3 features
    model = model_factory(n_estimators=5, num_leaves=7, verbose=-1).fit(**fit_kwargs)

    # any column count other than the 3 seen at fit() time must be rejected,
    # whether X has too many (4) or too few (2) columns
    bad_inputs = [
        (X, f"X has 4 features, but {estimator_name} is expecting 3 features as input"),
        (X[:, :-2], f"X has 2 features, but {estimator_name} is expecting 3 features as input"),
    ]
    for bad_X, err_msg in bad_inputs:
        with pytest.raises(ValueError, match=err_msg):
            model.predict(bad_X, predict_disable_shape_check=predict_disable_shape_check)
        if estimator_name == "LGBMClassifier":
            with pytest.raises(ValueError, match=err_msg):
                model.predict_proba(bad_X, predict_disable_shape_check=predict_disable_shape_check)

    # same number of columns in both: no error
    preds = model.predict(X[:, :-1], predict_disable_shape_check=predict_disable_shape_check)
    assert preds.shape == y.shape
    if estimator_name == "LGBMClassifier":
        preds = model.predict_proba(X[:, :-1], predict_disable_shape_check=predict_disable_shape_check)
        assert preds.shape[0] == y.shape[0]
@pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"]) @pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
@pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_Series", "pd_DataFrame"]) @pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_Series", "pd_DataFrame"])
@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "regression"]) @pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "regression"])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment