[python-package][sklearn] Support PyArrow Table as an input in scikit-learn methods (#6910)

Commit 72a39817 · Nikita Titov · GitHub · c58320fc
Unverified commit 72a39817, authored May 27, 2025 by Nikita Titov; committed by GitHub on May 26, 2025.
3 changed files
--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@@ -269,6 +269,7 @@ try:
    from pyarrow import Array as pa_Array
    from pyarrow import ChunkedArray as pa_ChunkedArray
    from pyarrow import Table as pa_Table
+    from pyarrow import array as pa_array
    from pyarrow import chunked_array as pa_chunked_array
    from pyarrow.types import is_boolean as arrow_is_boolean
    from pyarrow.types import is_floating as arrow_is_floating
@@ -302,6 +303,7 @@ except ImportError:
        all = None
        equal = None

+    pa_array = None
    pa_chunked_array = None
    arrow_is_boolean = None
    arrow_is_integer = None

--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -41,6 +41,7 @@ from .compat import (
    _LGBMRegressorBase,
    _LGBMValidateData,
    _sklearn_version,
+    pa_Table,
    pd_DataFrame,
 )
 from .engine import train
@@ -60,6 +61,7 @@ _LGBM_ScikitMatrixLike = Union[
    List[Union[List[float], List[int]]],
    np.ndarray,
    pd_DataFrame,
+    pa_Table,
    scipy.sparse.spmatrix,
 ]
 _LGBM_ScikitCustomObjectiveFunction = Union[
@@ -943,7 +945,7 @@ class LGBMModel(_LGBMModelBase):
        params["metric"] = [e for e in eval_metrics_builtin if e not in params["metric"]] + params["metric"]
        params["metric"] = [metric for metric in params["metric"] if metric is not None]

-        if not isinstance(X, pd_DataFrame):
+        if not isinstance(X, (pd_DataFrame, pa_Table)):
            _X, _y = _LGBMValidateData(
                self,
                X,
@@ -1075,7 +1077,7 @@ class LGBMModel(_LGBMModelBase):

    fit.__doc__ = (
        _lgbmmodel_doc_fit.format(
-            X_shape="numpy array, pandas DataFrame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
+            X_shape="numpy array, pandas DataFrame, pyarrow Table, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
            y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float, pyarrow Array, pyarrow ChunkedArray of shape = [n_samples]",
            sample_weight_shape="numpy array, pandas Series, list of int or float, pyarrow Array, pyarrow ChunkedArray of shape = [n_samples] or None, optional (default=None)",
            init_score_shape="numpy array, pandas DataFrame, pandas Series, list of int or float, list of lists, pyarrow Array, pyarrow ChunkedArray, pyarrow Table of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
@@ -1102,7 +1104,7 @@ class LGBMModel(_LGBMModelBase):
        """Docstring is set after definition, using a template."""
        if not self.__sklearn_is_fitted__():
            raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.")
-        if not isinstance(X, pd_DataFrame):
+        if not isinstance(X, (pd_DataFrame, pa_Table)):
            X = _LGBMValidateData(
                self,
                X,

--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -25,7 +25,11 @@ import lightgbm as lgb
 from lightgbm.compat import (
    DASK_INSTALLED,
    PANDAS_INSTALLED,
+    PYARROW_INSTALLED,
    _sklearn_version,
+    pa_array,
+    pa_chunked_array,
+    pa_Table,
    pd_DataFrame,
    pd_Series,
 )
@@ -54,6 +58,9 @@ task_to_model_factory = {
    "regression": lgb.LGBMRegressor,
 }
 all_tasks = tuple(task_to_model_factory.keys())
+all_x_types = ("list2d", "numpy", "pd_DataFrame", "pa_Table", "scipy_csc", "scipy_csr")
+all_y_types = ("list1d", "numpy", "pd_Series", "pd_DataFrame", "pa_Array", "pa_ChunkedArray")
+all_group_types = ("list1d_float", "list1d_int", "numpy", "pd_Series", "pa_Array", "pa_ChunkedArray")


 def _create_data(task, n_samples=100, n_features=4):
@@ -1884,16 +1891,11 @@ def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disabl
        assert preds.shape[0] == y.shape[0]


-@pytest.mark.parametrize("X_type", ["list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
-@pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_Series", "pd_DataFrame"])
-@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "regression"])
-def test_classification_and_regression_minimally_work_with_all_all_accepted_data_types(X_type, y_type, task, rng):
-    if any(t.startswith("pd_") for t in [X_type, y_type]) and not PANDAS_INSTALLED:
-        pytest.skip("pandas is not installed")
+def run_minimal_test(X_type, y_type, g_type, task, rng):
    X, y, g = _create_data(task, n_samples=2_000)
    weights = np.abs(rng.standard_normal(size=(y.shape[0],)))

-    if task == "binary-classification" or task == "regression":
+    if task in {"binary-classification", "regression", "ranking"}:
        init_score = np.full_like(y, np.mean(y))
    elif task == "multiclass-classification":
        init_score = np.outer(y, np.array([0.1, 0.2, 0.7]))
@@ -1909,6 +1911,8 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
        X = scipy.sparse.csr_matrix(X)
    elif X_type == "pd_DataFrame":
        X = pd_DataFrame(X)
+    elif X_type == "pa_Table":
+        X = pa_Table.from_pandas(pd_DataFrame(X))
    elif X_type != "numpy":
        raise ValueError(f"Unrecognized X_type: '{X_type}'")

@@ -1932,19 +1936,50 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
            init_score = pd_DataFrame(init_score)
        else:
            init_score = pd_Series(init_score)
+    elif y_type == "pa_Array":
+        y = pa_array(y)
+        weights = pa_array(weights)
+        if task == "multiclass-classification":
+            init_score = pa_Table.from_pandas(pd_DataFrame(init_score))
+        else:
+            init_score = pa_array(init_score)
+    elif y_type == "pa_ChunkedArray":
+        y = pa_chunked_array([y])
+        weights = pa_chunked_array([weights])
+        if task == "multiclass-classification":
+            init_score = pa_Table.from_pandas(pd_DataFrame(init_score))
+        else:
+            init_score = pa_chunked_array([init_score])
    elif y_type != "numpy":
        raise ValueError(f"Unrecognized y_type: '{y_type}'")

+    if g_type == "list1d_float":
+        g = g.astype("float").tolist()
+    elif g_type == "list1d_int":
+        g = g.astype("int").tolist()
+    elif g_type == "pd_Series":
+        g = pd_Series(g)
+    elif g_type == "pa_Array":
+        g = pa_array(g)
+    elif g_type == "pa_ChunkedArray":
+        g = pa_chunked_array([g])
+    elif g_type != "numpy":
+        raise ValueError(f"Unrecognized g_type: '{g_type}'")
+
    model = task_to_model_factory[task](n_estimators=10, verbose=-1)
-    model.fit(
-        X=X,
-        y=y,
-        sample_weight=weights,
-        init_score=init_score,
-        eval_set=[(X_valid, y)],
-        eval_sample_weight=[weights],
-        eval_init_score=[init_score],
-    )
+    params_fit = {
+        "X": X,
+        "y": y,
+        "sample_weight": weights,
+        "init_score": init_score,
+        "eval_set": [(X_valid, y)],
+        "eval_sample_weight": [weights],
+        "eval_init_score": [init_score],
+    }
+    if task == "ranking":
+        params_fit["group"] = g
+        params_fit["eval_group"] = [g]
+    model.fit(**params_fit)

    preds = model.predict(X)
    if task == "binary-classification":
@@ -1953,72 +1988,44 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
        assert accuracy_score(y, preds) >= 0.99
    elif task == "regression":
        assert r2_score(y, preds) > 0.86
+    elif task == "ranking":
+        assert spearmanr(preds, y).correlation >= 0.99
    else:
        raise ValueError(f"Unrecognized task: '{task}'")


-@pytest.mark.parametrize("X_type", ["list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
-@pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_DataFrame", "pd_Series"])
-@pytest.mark.parametrize("g_type", ["list1d_float", "list1d_int", "numpy", "pd_Series"])
-def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type, g_type, rng):
-    if any(t.startswith("pd_") for t in [X_type, y_type, g_type]) and not PANDAS_INSTALLED:
+@pytest.mark.parametrize("X_type", all_x_types)
+@pytest.mark.parametrize("y_type", all_y_types)
+@pytest.mark.parametrize("task", [t for t in all_tasks if t != "ranking"])
+def test_classification_and_regression_minimally_work_with_all_accepted_data_types(
+    X_type,
+    y_type,
+    task,
+    rng,
+):
+    if any(t.startswith("pd_") for t in [X_type, y_type]) and not PANDAS_INSTALLED:
        pytest.skip("pandas is not installed")
-    X, y, g = _create_data(task="ranking", n_samples=1_000)
-    weights = np.abs(rng.standard_normal(size=(y.shape[0],)))
-    init_score = np.full_like(y, np.mean(y))
-    X_valid = X * 2
+    if any(t.startswith("pa_") for t in [X_type, y_type]) and not PYARROW_INSTALLED:
+        pytest.skip("pyarrow is not installed")

-    if X_type == "list2d":
-        X = X.tolist()
-    elif X_type == "scipy_csc":
-        X = scipy.sparse.csc_matrix(X)
-    elif X_type == "scipy_csr":
-        X = scipy.sparse.csr_matrix(X)
-    elif X_type == "pd_DataFrame":
-        X = pd_DataFrame(X)
-    elif X_type != "numpy":
-        raise ValueError(f"Unrecognized X_type: '{X_type}'")
+    run_minimal_test(X_type=X_type, y_type=y_type, g_type="numpy", task=task, rng=rng)

-    # make weights and init_score same types as y, just to avoid
-    # a huge number of combinations and therefore test cases
-    if y_type == "list1d":
-        y = y.tolist()
-        weights = weights.tolist()
-        init_score = init_score.tolist()
-    elif y_type == "pd_DataFrame":
-        y = pd_DataFrame(y)
-        weights = pd_Series(weights)
-        init_score = pd_Series(init_score)
-    elif y_type == "pd_Series":
-        y = pd_Series(y)
-        weights = pd_Series(weights)
-        init_score = pd_Series(init_score)
-    elif y_type != "numpy":
-        raise ValueError(f"Unrecognized y_type: '{y_type}'")

-    if g_type == "list1d_float":
-        g = g.astype("float").tolist()
-    elif g_type == "list1d_int":
-        g = g.astype("int").tolist()
-    elif g_type == "pd_Series":
-        g = pd_Series(g)
-    elif g_type != "numpy":
-        raise ValueError(f"Unrecognized g_type: '{g_type}'")
+@pytest.mark.parametrize("X_type", all_x_types)
+@pytest.mark.parametrize("y_type", all_y_types)
+@pytest.mark.parametrize("g_type", all_group_types)
+def test_ranking_minimally_works_with_all_accepted_data_types(
+    X_type,
+    y_type,
+    g_type,
+    rng,
+):
+    if any(t.startswith("pd_") for t in [X_type, y_type, g_type]) and not PANDAS_INSTALLED:
+        pytest.skip("pandas is not installed")
+    if any(t.startswith("pa_") for t in [X_type, y_type, g_type]) and not PYARROW_INSTALLED:
+        pytest.skip("pyarrow is not installed")

-    model = task_to_model_factory["ranking"](n_estimators=10, verbose=-1)
-    model.fit(
-        X=X,
-        y=y,
-        sample_weight=weights,
-        init_score=init_score,
-        group=g,
-        eval_set=[(X_valid, y)],
-        eval_sample_weight=[weights],
-        eval_init_score=[init_score],
-        eval_group=[g],
-    )
-    preds = model.predict(X)
-    assert spearmanr(preds, y).correlation >= 0.99
+    run_minimal_test(X_type=X_type, y_type=y_type, g_type=g_type, task="ranking", rng=rng)


 def test_classifier_fit_detects_classes_every_time():