Unverified Commit 72a39817 authored by Nikita Titov's avatar Nikita Titov Committed by GitHub
Browse files

[python-package][sklearn] Support PyArrow Table as an input in scikit-learn methods (#6910)

parent c58320fc
......@@ -269,6 +269,7 @@ try:
from pyarrow import Array as pa_Array
from pyarrow import ChunkedArray as pa_ChunkedArray
from pyarrow import Table as pa_Table
from pyarrow import array as pa_array
from pyarrow import chunked_array as pa_chunked_array
from pyarrow.types import is_boolean as arrow_is_boolean
from pyarrow.types import is_floating as arrow_is_floating
......@@ -302,6 +303,7 @@ except ImportError:
all = None
equal = None
pa_array = None
pa_chunked_array = None
arrow_is_boolean = None
arrow_is_integer = None
......
......@@ -41,6 +41,7 @@ from .compat import (
_LGBMRegressorBase,
_LGBMValidateData,
_sklearn_version,
pa_Table,
pd_DataFrame,
)
from .engine import train
......@@ -60,6 +61,7 @@ _LGBM_ScikitMatrixLike = Union[
List[Union[List[float], List[int]]],
np.ndarray,
pd_DataFrame,
pa_Table,
scipy.sparse.spmatrix,
]
_LGBM_ScikitCustomObjectiveFunction = Union[
......@@ -943,7 +945,7 @@ class LGBMModel(_LGBMModelBase):
params["metric"] = [e for e in eval_metrics_builtin if e not in params["metric"]] + params["metric"]
params["metric"] = [metric for metric in params["metric"] if metric is not None]
if not isinstance(X, pd_DataFrame):
if not isinstance(X, (pd_DataFrame, pa_Table)):
_X, _y = _LGBMValidateData(
self,
X,
......@@ -1075,7 +1077,7 @@ class LGBMModel(_LGBMModelBase):
fit.__doc__ = (
_lgbmmodel_doc_fit.format(
X_shape="numpy array, pandas DataFrame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
X_shape="numpy array, pandas DataFrame, pyarrow Table, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float, pyarrow Array, pyarrow ChunkedArray of shape = [n_samples]",
sample_weight_shape="numpy array, pandas Series, list of int or float, pyarrow Array, pyarrow ChunkedArray of shape = [n_samples] or None, optional (default=None)",
init_score_shape="numpy array, pandas DataFrame, pandas Series, list of int or float, list of lists, pyarrow Array, pyarrow ChunkedArray, pyarrow Table of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
......@@ -1102,7 +1104,7 @@ class LGBMModel(_LGBMModelBase):
"""Docstring is set after definition, using a template."""
if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.")
if not isinstance(X, pd_DataFrame):
if not isinstance(X, (pd_DataFrame, pa_Table)):
X = _LGBMValidateData(
self,
X,
......
......@@ -25,7 +25,11 @@ import lightgbm as lgb
from lightgbm.compat import (
DASK_INSTALLED,
PANDAS_INSTALLED,
PYARROW_INSTALLED,
_sklearn_version,
pa_array,
pa_chunked_array,
pa_Table,
pd_DataFrame,
pd_Series,
)
......@@ -54,6 +58,9 @@ task_to_model_factory = {
"regression": lgb.LGBMRegressor,
}
all_tasks = tuple(task_to_model_factory.keys())
all_x_types = ("list2d", "numpy", "pd_DataFrame", "pa_Table", "scipy_csc", "scipy_csr")
all_y_types = ("list1d", "numpy", "pd_Series", "pd_DataFrame", "pa_Array", "pa_ChunkedArray")
all_group_types = ("list1d_float", "list1d_int", "numpy", "pd_Series", "pa_Array", "pa_ChunkedArray")
def _create_data(task, n_samples=100, n_features=4):
......@@ -1884,16 +1891,11 @@ def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disabl
assert preds.shape[0] == y.shape[0]
@pytest.mark.parametrize("X_type", ["list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
@pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_Series", "pd_DataFrame"])
@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "regression"])
def test_classification_and_regression_minimally_work_with_all_all_accepted_data_types(X_type, y_type, task, rng):
if any(t.startswith("pd_") for t in [X_type, y_type]) and not PANDAS_INSTALLED:
pytest.skip("pandas is not installed")
def run_minimal_test(X_type, y_type, g_type, task, rng):
X, y, g = _create_data(task, n_samples=2_000)
weights = np.abs(rng.standard_normal(size=(y.shape[0],)))
if task == "binary-classification" or task == "regression":
if task in {"binary-classification", "regression", "ranking"}:
init_score = np.full_like(y, np.mean(y))
elif task == "multiclass-classification":
init_score = np.outer(y, np.array([0.1, 0.2, 0.7]))
......@@ -1909,6 +1911,8 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
X = scipy.sparse.csr_matrix(X)
elif X_type == "pd_DataFrame":
X = pd_DataFrame(X)
elif X_type == "pa_Table":
X = pa_Table.from_pandas(pd_DataFrame(X))
elif X_type != "numpy":
raise ValueError(f"Unrecognized X_type: '{X_type}'")
......@@ -1932,19 +1936,50 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
init_score = pd_DataFrame(init_score)
else:
init_score = pd_Series(init_score)
elif y_type == "pa_Array":
y = pa_array(y)
weights = pa_array(weights)
if task == "multiclass-classification":
init_score = pa_Table.from_pandas(pd_DataFrame(init_score))
else:
init_score = pa_array(init_score)
elif y_type == "pa_ChunkedArray":
y = pa_chunked_array([y])
weights = pa_chunked_array([weights])
if task == "multiclass-classification":
init_score = pa_Table.from_pandas(pd_DataFrame(init_score))
else:
init_score = pa_chunked_array([init_score])
elif y_type != "numpy":
raise ValueError(f"Unrecognized y_type: '{y_type}'")
if g_type == "list1d_float":
g = g.astype("float").tolist()
elif g_type == "list1d_int":
g = g.astype("int").tolist()
elif g_type == "pd_Series":
g = pd_Series(g)
elif g_type == "pa_Array":
g = pa_array(g)
elif g_type == "pa_ChunkedArray":
g = pa_chunked_array([g])
elif g_type != "numpy":
raise ValueError(f"Unrecognized g_type: '{g_type}'")
model = task_to_model_factory[task](n_estimators=10, verbose=-1)
model.fit(
X=X,
y=y,
sample_weight=weights,
init_score=init_score,
eval_set=[(X_valid, y)],
eval_sample_weight=[weights],
eval_init_score=[init_score],
)
params_fit = {
"X": X,
"y": y,
"sample_weight": weights,
"init_score": init_score,
"eval_set": [(X_valid, y)],
"eval_sample_weight": [weights],
"eval_init_score": [init_score],
}
if task == "ranking":
params_fit["group"] = g
params_fit["eval_group"] = [g]
model.fit(**params_fit)
preds = model.predict(X)
if task == "binary-classification":
......@@ -1953,72 +1988,44 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
assert accuracy_score(y, preds) >= 0.99
elif task == "regression":
assert r2_score(y, preds) > 0.86
elif task == "ranking":
assert spearmanr(preds, y).correlation >= 0.99
else:
raise ValueError(f"Unrecognized task: '{task}'")
@pytest.mark.parametrize("X_type", ["list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
@pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_DataFrame", "pd_Series"])
@pytest.mark.parametrize("g_type", ["list1d_float", "list1d_int", "numpy", "pd_Series"])
def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type, g_type, rng):
if any(t.startswith("pd_") for t in [X_type, y_type, g_type]) and not PANDAS_INSTALLED:
@pytest.mark.parametrize("X_type", all_x_types)
@pytest.mark.parametrize("y_type", all_y_types)
@pytest.mark.parametrize("task", [t for t in all_tasks if t != "ranking"])
def test_classification_and_regression_minimally_work_with_all_accepted_data_types(
X_type,
y_type,
task,
rng,
):
if any(t.startswith("pd_") for t in [X_type, y_type]) and not PANDAS_INSTALLED:
pytest.skip("pandas is not installed")
X, y, g = _create_data(task="ranking", n_samples=1_000)
weights = np.abs(rng.standard_normal(size=(y.shape[0],)))
init_score = np.full_like(y, np.mean(y))
X_valid = X * 2
if any(t.startswith("pa_") for t in [X_type, y_type]) and not PYARROW_INSTALLED:
pytest.skip("pyarrow is not installed")
if X_type == "list2d":
X = X.tolist()
elif X_type == "scipy_csc":
X = scipy.sparse.csc_matrix(X)
elif X_type == "scipy_csr":
X = scipy.sparse.csr_matrix(X)
elif X_type == "pd_DataFrame":
X = pd_DataFrame(X)
elif X_type != "numpy":
raise ValueError(f"Unrecognized X_type: '{X_type}'")
run_minimal_test(X_type=X_type, y_type=y_type, g_type="numpy", task=task, rng=rng)
# make weights and init_score same types as y, just to avoid
# a huge number of combinations and therefore test cases
if y_type == "list1d":
y = y.tolist()
weights = weights.tolist()
init_score = init_score.tolist()
elif y_type == "pd_DataFrame":
y = pd_DataFrame(y)
weights = pd_Series(weights)
init_score = pd_Series(init_score)
elif y_type == "pd_Series":
y = pd_Series(y)
weights = pd_Series(weights)
init_score = pd_Series(init_score)
elif y_type != "numpy":
raise ValueError(f"Unrecognized y_type: '{y_type}'")
if g_type == "list1d_float":
g = g.astype("float").tolist()
elif g_type == "list1d_int":
g = g.astype("int").tolist()
elif g_type == "pd_Series":
g = pd_Series(g)
elif g_type != "numpy":
raise ValueError(f"Unrecognized g_type: '{g_type}'")
@pytest.mark.parametrize("X_type", all_x_types)
@pytest.mark.parametrize("y_type", all_y_types)
@pytest.mark.parametrize("g_type", all_group_types)
def test_ranking_minimally_works_with_all_accepted_data_types(
X_type,
y_type,
g_type,
rng,
):
if any(t.startswith("pd_") for t in [X_type, y_type, g_type]) and not PANDAS_INSTALLED:
pytest.skip("pandas is not installed")
if any(t.startswith("pa_") for t in [X_type, y_type, g_type]) and not PYARROW_INSTALLED:
pytest.skip("pyarrow is not installed")
model = task_to_model_factory["ranking"](n_estimators=10, verbose=-1)
model.fit(
X=X,
y=y,
sample_weight=weights,
init_score=init_score,
group=g,
eval_set=[(X_valid, y)],
eval_sample_weight=[weights],
eval_init_score=[init_score],
eval_group=[g],
)
preds = model.predict(X)
assert spearmanr(preds, y).correlation >= 0.99
run_minimal_test(X_type=X_type, y_type=y_type, g_type=g_type, task="ranking", rng=rng)
def test_classifier_fit_detects_classes_every_time():
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment